#include "mat_mul.h"

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Operands and dimensions of the multiplication in progress; written by
 * mat_mul() before any worker starts and only read by the workers. */
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
/* Number of consecutive rows of C handled by one worker. */
static int offset;

#define min(a, b) (((a) <= (b)) ? (a) : (b))

/* Tile edge length used to block the loops over the K and N dimensions. */
enum { BLOCK_SIZE = 128 };

/*
 * Worker entry point: accumulates A * B into the rows
 * [row_begin, min(row_begin + offset, M)) of C.
 *
 * data carries the first row index, packed into the pointer value.
 *
 * Indexing used below: A[i * K + j] (i < M, j < K), B[j * N + k] (k < N) and
 * C[i * N + k], i.e. all three matrices are addressed row-major.
 * The products are added onto the existing contents of C with '+='.
 * NOTE(review): presumably the caller zeroes C beforehand -- confirm.
 *
 * Always returns NULL.
 */
static void *mat_mul_thread(void *data)
{
    const int64_t row_begin = (int64_t)(intptr_t)data;
    const int64_t row_end = min(row_begin + offset, M);
    /* Row strides as size_t so element offsets cannot overflow int. */
    const size_t a_stride = (size_t)K;
    const size_t bc_stride = (size_t)N;

    for (int jj = 0; jj < K; jj += BLOCK_SIZE) {
        const int j_end = min(jj + BLOCK_SIZE, K);

        for (int kk = 0; kk < N; kk += BLOCK_SIZE) {
            const int k_end = min(kk + BLOCK_SIZE, N);

            int64_t i = row_begin;
            while (i < row_end) {
                if (i + 2 < row_end) {
                    /* Three rows at once: each loaded B element is reused
                     * for three rows of C. */
                    const float *a0 = A + (size_t)i * a_stride;
                    const float *a1 = a0 + a_stride;
                    const float *a2 = a1 + a_stride;
                    float *c0 = C + (size_t)i * bc_stride;
                    float *c1 = c0 + bc_stride;
                    float *c2 = c1 + bc_stride;

                    for (int j = jj; j < j_end; j++) {
                        const float *b_row = B + (size_t)j * bc_stride;
                        const float c00 = a0[j];
                        const float c10 = a1[j];
                        const float c20 = a2[j];

                        for (int k = kk; k < k_end; k++) {
                            const float temp = b_row[k];
                            c0[k] += c00 * temp;
                            c1[k] += c10 * temp;
                            c2[k] += c20 * temp;
                        }
                    }
                    i += 3;
                } else {
                    /* Fewer than three rows left in this slice: one row. */
                    const float *a0 = A + (size_t)i * a_stride;
                    float *c0 = C + (size_t)i * bc_stride;

                    for (int j = jj; j < j_end; j++) {
                        const float *b_row = B + (size_t)j * bc_stride;
                        const float c00 = a0[j];

                        for (int k = kk; k < k_end; k++) {
                            c0[k] += c00 * b_row[k];
                        }
                    }
                    i += 1;
                }
            }
        }
    }

    return NULL;
}

/*
 * Multiplies A by B and accumulates the result into C, splitting the rows of
 * C into contiguous slices that are processed by separate pthreads.
 *
 * _A            left operand, indexed as _A[i * _K + j]
 * _B            right operand, indexed as _B[j * _N + k]
 * _C            result, indexed as _C[i * _N + k]; updated with '+='
 * _M, _N, _K    dimensions; nothing is done if any of them is <= 0
 * _num_threads  requested worker count; values <= 0 are treated as 1
 *
 * Each slice holds ceil(M / num_threads) rows (one row when M <= num_threads),
 * so fewer than _num_threads workers are started when there are not enough
 * rows to go around. If the thread-handle array cannot be allocated or a
 * pthread_create() call fails, the affected slice is computed in the calling
 * thread instead, so the full product is always produced.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads)
{
    A = _A, B = _B, C = _C;
    M = _M, N = _N, K = _K;
    /* A non-positive count would divide by zero or start no worker at all. */
    num_threads = (_num_threads > 0) ? _num_threads : 1;

    if (M <= 0 || N <= 0 || K <= 0) {
        return;
    }

    if (M <= num_threads) {
        offset = 1;
    } else {
        /* Ceiling division so that the last rows are not dropped. */
        offset = M / num_threads + ((M % num_threads != 0) ? 1 : 0);
    }

    /* Number of non-empty slices; never exceeds num_threads. */
    const int64_t slices = ((int64_t)M + offset - 1) / offset;

    pthread_t *threads = malloc((size_t)slices * sizeof *threads);
    size_t started = 0;

    for (int64_t s = 0; s < slices; s++) {
        void *first_row = (void *)(intptr_t)(s * offset);

        if (threads == NULL ||
            pthread_create(&threads[started], NULL, mat_mul_thread,
                           first_row) != 0) {
            /* Slices cover disjoint rows of C, so running one here does not
             * conflict with workers that are already running. */
            mat_mul_thread(first_row);
        } else {
            started++;
        }
    }

    for (size_t t = 0; t < started; t++) {
        pthread_join(threads[t], NULL);
    }

    free(threads);
}