#include "mat_mul.h"

#include <pthread.h>

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Operands of the multiplication currently in flight. They are published by
// mat_mul() before any worker is started and are only read by the workers.
// Because this state is file-scope, mat_mul() must not be called concurrently
// from several threads.
static float *A, *B, *C;
static int M, N, K;
static int num_threads;

// Width of one tile along the K dimension. The K loop is tiled so that the
// rows of B touched by one tile are reused across all rows a worker owns.
static const int kBlockK = 32;

// Half-open range [begin, end) of rows of C that one worker computes.
struct RowRange {
  int begin;
  int end;
};

// Accumulates A * B into rows [row_begin, row_end) of C.
//
// A is indexed as A[i * K + k], B as B[k * N + j] and C as C[i * N + j].
// Products are ADDED to the existing contents of C, and for every element the
// additions happen in increasing k order.
static void mat_mul_rows(int row_begin, int row_end) {
  int k_end = 0;
  for (int kk = 0; kk < K; kk = k_end) {
    // Written as a comparison on (K - kk) so that kk + kBlockK cannot
    // overflow when K is close to INT_MAX.
    k_end = (K - kk < kBlockK) ? K : kk + kBlockK;

    for (int i = row_begin; i < row_end; ++i) {
      // Offsets are computed in size_t: i * K does not fit in an int once
      // the matrices exceed 2^31 elements.
      const float *a_row = A + static_cast<std::size_t>(i) * K;
      float *c_row = C + static_cast<std::size_t>(i) * N;

      for (int k = kk; k < k_end; ++k) {
        const float a_ik = a_row[k];
        const float *b_row = B + static_cast<std::size_t>(k) * N;

        // Unit-stride loop over j; manual unrolling is left to the compiler.
        for (int j = 0; j < N; ++j) {
          c_row[j] += a_ik * b_row[j];
        }
      }
    }
  }
}

// pthread entry point. `data` points to the RowRange owned by this worker.
// Workers write disjoint rows of C, so no locking is required.
static void *mat_mul_thread(void *data) {
  const RowRange *range = static_cast<const RowRange *>(data);
  mat_mul_rows(range->begin, range->end);
  return NULL;
}

// Computes C += A * B using up to _num_threads POSIX threads.
//
// _A is read as an _M x _K matrix, _B as _K x _N and _C as _M x _N, all stored
// row-major in contiguous float arrays. The products are accumulated into the
// existing contents of _C, so the caller has to initialise _C (e.g. to zero).
//
// Rows of C are split as evenly as possible between the workers. A thread
// count below 1 is treated as 1, and no more workers than rows are started.
// If a worker thread cannot be created, its rows are computed on the calling
// thread instead, so the result is always complete. The call returns after
// all workers have finished.
void mat_mul(float *_A, float *_B, float *_C,
             int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;

  // Nothing to accumulate for an empty operand.
  if (M <= 0 || N <= 0 || K <= 0) {
    num_threads = 0;
    return;
  }

  num_threads = std::min(std::max(_num_threads, 1), M);

  // The first (M % num_threads) workers take one extra row, so no worker
  // ends up with more than one row above the average.
  const int base_rows = M / num_threads;
  const int extra_rows = M % num_threads;

  std::vector<RowRange> ranges(num_threads);
  int next_row = 0;
  for (int t = 0; t < num_threads; ++t) {
    ranges[t].begin = next_row;
    next_row += base_rows + (t < extra_rows ? 1 : 0);
    ranges[t].end = next_row;
  }

  std::vector<pthread_t> threads(num_threads);
  std::vector<char> started(num_threads, 0);

  for (int t = 0; t < num_threads; ++t) {
    started[t] =
        (pthread_create(&threads[t], NULL, mat_mul_thread, &ranges[t]) == 0);
  }

  // Pick up the slices whose thread could not be created (e.g. EAGAIN).
  for (int t = 0; t < num_threads; ++t) {
    if (!started[t]) {
      mat_mul_rows(ranges[t].begin, ranges[t].end);
    }
  }

  // Only join handles that pthread_create actually initialised.
  for (int t = 0; t < num_threads; ++t) {
    if (started[t]) {
      pthread_join(threads[t], NULL);
    }
  }
}