#include "mat_mul.h"

// NOTE(review): the targets of the three system includes were missing from the
// text under review. <pthread.h> is required by the code in this file;
// <stdio.h> and <stdlib.h> are presumed -- confirm against the original file.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

// Operands and dimensions of the current multiplication. They are written by
// mat_mul() and read by every worker thread.
static float *A, *B, *C;
static int M, N, K;
static int num_threads;

// Returns the smaller of a and b.
// `static inline` (rather than plain `inline`) guarantees that a definition
// exists in this translation unit even when the compiler chooses not to inline
// the call; a plain C99 `inline` definition can fail to link in that case.
static inline int min(int a, int b) {
  return a > b ? b : a;
}

// Serializes access to the worker slot counter in mat_mul_thread().
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

// Worker entry point: accumulates one contiguous band of rows of A * B into C.
//
// A is indexed as M x K, B as K x N and C as M x N, all row-major. Products are
// added to the existing contents of C (`+=`), so C must already hold the
// desired starting values -- presumably zeros, set up by the caller; confirm.
//
// data: unused. The worker's 1-based slot number comes from a counter shared
//       by all workers instead.
// Returns NULL.
static void *mat_mul_thread(void *data) {
  (void)data;

  // Blocking factors for the k and j loops.
  enum { TILE_K = 64, TILE_J = 1024 };

  // Hand out slot numbers 1..num_threads. The counter wraps modulo num_threads
  // before it is incremented, so each batch of num_threads workers receives
  // every slot exactly once.
  static int tid = 0;
  int num;
  pthread_mutex_lock(&lock);
  tid %= num_threads;
  tid++;
  num = tid;
  pthread_mutex_unlock(&lock);

  // Each slot owns q = ceil(M / num_threads) consecutive rows. Because
  // q * num_threads >= M, clamping the end to M is sufficient for every slot,
  // including the last one; trailing slots may end up with an empty range.
  int q = M / num_threads;
  if (M % num_threads) {
    q++;
  }
  const int row_begin = q * (num - 1);
  const int row_end = min(q * num, M);

  const float *a = A;
  const float *b = B;
  float *c = C;
  // Row strides as size_t so that element offsets do not overflow `int` for
  // large matrices.
  const size_t stride_a = (size_t)K;
  const size_t stride_bc = (size_t)N;

  for (int jj = 0; jj < N; jj += TILE_J) {
    const int j_end = min(jj + TILE_J, N);
    for (int kk = 0; kk < K; kk += TILE_K) {
      const int k_end = min(kk + TILE_K, K);
      for (int i = row_begin; i < row_end; ++i) {
        const float *a_row = a + (size_t)i * stride_a;
        float *c_row = c + (size_t)i * stride_bc;
        for (int k = kk; k < k_end; ++k) {
          const float a_ik = a_row[k];
          const float *b_row = b + (size_t)k * stride_bc;
          for (int j = jj; j < j_end; ++j) {
            c_row[j] += a_ik * b_row[j];
          }
        }
      }
    }
  }

  return NULL;
}

// NOTE(review): the definition below is cut off in the text under review. It is
// reproduced unchanged up to the point where the text ends.
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  // TODO: create '_num_threads' pthreads
  int i;
  pthread_t thread[num_threads];
  for(i=0; i