#include "mat_mul.h"
#include "util.h"

/*
 * NOTE(review): the names of the system headers were missing from the
 * reviewed copy of this file (bare "#include" directives). The headers
 * below are the ones the code in this file actually needs; confirm against
 * the original list and re-add anything else that was there.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdlib.h>

#define max(a,b) (((a) > (b)) ? (a) : (b))
#define min(a,b) (((a) < (b)) ? (a) : (b))

/*
 * Operands and dimensions of the multiplication in progress. They are
 * written by mat_mul() before any worker starts and only read by workers.
 * From the indexing used below: A is M x K, B is K x N, C is M x N, all
 * stored row-major.
 */
static float *A, *B, *C; //, *B_T;
static int M, N, K;
static int num_threads;

#if 0

/* Disabled naive single-threaded reference implementation. */
static void* mat_mul_thread(void *data) {
  // TODO: parallelize & optimize matrix multiplication
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      for (int k = 0; k < K; ++k) {
        C[i * N + j] += A[i * K + k] * B[k * N + j];
      }
    }
  }

  return NULL;
}

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  // TODO: create '_num_threads' pthreads
  pthread_t thread;
  pthread_create(&thread, NULL, mat_mul_thread, NULL);
  pthread_join(thread, NULL);
}

#else

/* Edge length of the square k/j tiles walked by each worker. */
enum { MAT_MUL_BLOCK_SIZE = 64 };

/* Book-keeping for one worker: its handle, its index, and whether the
 * thread was actually created (and therefore has to be joined). */
struct mat_mul_worker {
  pthread_t thread;
  int tid;
  int started;
};

/**
 * Worker entry point: accumulates A * B into one contiguous band of rows
 * of C.
 *
 * The M rows are split into num_threads bands whose sizes differ by at most
 * one row; worker `tid` owns the tid-th band, so no two workers ever write
 * the same element of C.
 *
 * C is only ever updated with `+=` and is never cleared here.
 *
 * @param data  pointer to an int holding this worker's index in
 *              [0, num_threads).
 * @return always NULL.
 */
static void* mat_mul_thread(void *data) {
  const int tid = *(const int *)data;

  /* Balanced split: the first `extra` bands get one additional row. */
  const int base = M / num_threads;
  const int extra = M % num_threads;
  const int row_start = tid * base + min(tid, extra);
  const int row_end = row_start + base + ((tid < extra) ? 1 : 0);

  /* The 4-way unrolled kernel reads k..k+3. That stays in bounds only when
   * K is a multiple of 4 (tile starts are multiples of 64, hence of 4). */
  const int unroll4 = ((K % 4) == 0);

  for (int kk = 0; kk < K; kk += MAT_MUL_BLOCK_SIZE) {
    const int k_end = min(kk + MAT_MUL_BLOCK_SIZE, K);

    for (int jj = 0; jj < N; jj += MAT_MUL_BLOCK_SIZE) {
      const int j_end = min(jj + MAT_MUL_BLOCK_SIZE, N);

      for (int i = row_start; i < row_end; i++) {
        /* Offsets are computed in size_t so that large matrices do not
         * overflow int arithmetic. */
        const float *a_row = A + (size_t)i * (size_t)K;
        float *c_row = C + (size_t)i * (size_t)N;

        if (unroll4) {
          for (int k = kk; k < k_end; k += 4) {
            const float a0 = a_row[k];
            const float a1 = a_row[k + 1];
            const float a2 = a_row[k + 2];
            const float a3 = a_row[k + 3];

            const float *b0 = B + (size_t)k * (size_t)N;
            const float *b1 = b0 + N;
            const float *b2 = b1 + N;
            const float *b3 = b2 + N;

            for (int j = jj; j < j_end; j++) {
              /* Four products summed left to right, then added to C once. */
              const float c0 = a0 * b0[j];
              const float c1 = a1 * b1[j];
              const float c2 = a2 * b2[j];
              const float c3 = a3 * b3[j];
              c_row[j] += (c0 + c1 + c2 + c3);
            }
          }
        } else {
          for (int k = kk; k < k_end; k++) {
            const float a_ik = a_row[k];
            const float *b_row = B + (size_t)k * (size_t)N;

            for (int j = jj; j < j_end; j++) {
              c_row[j] += a_ik * b_row[j];
            }
          }
        }
      }
    }
  }

  return NULL;
}

/**
 * Accumulates the product A * B into C using up to `_num_threads` threads.
 *
 * @param _A            M x K input matrix, row-major.
 * @param _B            K x N input matrix, row-major.
 * @param _C            M x N output matrix, row-major; updated with `+=`
 *                      and not cleared by this function.
 * @param _M, _N, _K    matrix dimensions; if any is <= 0 nothing is computed.
 * @param _num_threads  requested worker count; values below 1 are treated
 *                      as 1 and values above _M are reduced to _M.
 *
 * If the worker table cannot be allocated the whole product is computed on
 * the calling thread. If an individual thread cannot be created, its band
 * of rows is computed on the calling thread instead, so C is always
 * complete on return.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;

  if (M <= 0 || N <= 0 || K <= 0) {
    return;
  }

  /* Never run with zero/negative workers, and never with more workers
   * than rows (the surplus would own empty bands). */
  num_threads = _num_threads;
  if (num_threads < 1) {
    num_threads = 1;
  }
  if (num_threads > M) {
    num_threads = M;
  }

  struct mat_mul_worker *workers = NULL;
  if (num_threads > 1) {
    workers = calloc((size_t)num_threads, sizeof *workers);
  }

  if (workers == NULL) {
    /* Single worker requested, or allocation failed: do all the work here.
     * No thread exists yet, so changing num_threads is safe. */
    int tid = 0;
    num_threads = 1;
    mat_mul_thread(&tid);
    return;
  }

  for (int i = 0; i < num_threads; i++) {
    workers[i].tid = i;
    workers[i].started =
        (pthread_create(&workers[i].thread, NULL, mat_mul_thread,
                        &workers[i].tid) == 0);
    if (!workers[i].started) {
      /* Thread creation failed: compute this band synchronously. */
      mat_mul_thread(&workers[i].tid);
    }
  }

  for (int i = 0; i < num_threads; i++) {
    if (workers[i].started) {
      (void)pthread_join(workers[i].thread, NULL);
    }
  }

  free(workers);
}

#endif