#include "mat_mul.h"

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

// Operands of the multiplication currently in flight: C(MxN) += A(MxK) * B(KxN),
// all stored row-major. They are published by mat_mul() before any worker
// starts and are only read by the workers (C is written, but each worker owns
// a disjoint set of rows).
static float *A, *B, *C;
static int M, N, K;
static int num_threads;

// Tile extents along the K and N dimensions used by the blocked kernel.
static const int BLOCKSIZE_k = 45;
static const int BLOCKSIZE_n = 2048;

// Unroll factor of the innermost loop. The loop body below is written out by
// hand for exactly this many columns, so the two must be changed together.
#define UNROLL 4
#if UNROLL != 4
#error "the inner loop of mat_mul_rows() is hand-unrolled for UNROLL == 4"
#endif

// Half-open range of rows [begin, end) of C assigned to one worker.
struct row_range {
  int begin;
  int end;
};

// Returns the smaller of two ints (function instead of a macro so arguments
// are evaluated exactly once).
static inline int min_int(int a, int b) { return (a < b) ? a : b; }

// Accumulates rows [row_begin, row_end) of C += A * B.
//
// The K and N dimensions are tiled (BLOCKSIZE_k x BLOCKSIZE_n) and the
// innermost loop over columns is unrolled by UNROLL with a scalar tail loop
// for the leftover columns of each tile. For every element of C the products
// are still added in increasing-k order.
static void mat_mul_rows(int row_begin, int row_end) {
  for (int bk = 0; bk < K; bk += BLOCKSIZE_k) {
    const int k_end = min_int(bk + BLOCKSIZE_k, K);

    for (int bn = 0; bn < N; bn += BLOCKSIZE_n) {
      const int n_end = min_int(bn + BLOCKSIZE_n, N);
      // Last column index (exclusive) reachable in whole UNROLL-sized steps.
      const int unrolled_end = bn + ((n_end - bn) / UNROLL) * UNROLL;

      for (int i = row_begin; i < row_end; ++i) {
        // Row offsets are computed in size_t so that i * N cannot overflow
        // int for large matrices.
        const float *a_row = A + (size_t)i * (size_t)K;
        float *c_row = C + (size_t)i * (size_t)N;

        for (int k = bk; k < k_end; ++k) {
          const float a = a_row[k];
          const float *b_row = B + (size_t)k * (size_t)N;

          int j = bn;
          for (; j < unrolled_end; j += UNROLL) {
            c_row[j] += a * b_row[j];
            c_row[j + 1] += a * b_row[j + 1];
            c_row[j + 2] += a * b_row[j + 2];
            c_row[j + 3] += a * b_row[j + 3];
          }
          // Tail: columns of this tile that do not fill a whole unroll step.
          for (; j < n_end; ++j) {
            c_row[j] += a * b_row[j];
          }
        }
      }
    }
  }
}

// pthread entry point: `data` points to the struct row_range owned by
// mat_mul(), which stays alive until the thread has been joined.
static void *mat_mul_thread(void *data) {
  const struct row_range *range = (const struct row_range *)data;
  mat_mul_rows(range->begin, range->end);
  return NULL;
}

// Computes C += A * B using up to `_num_threads` pthreads.
//
//   _A: M x K input matrix, row-major
//   _B: K x N input matrix, row-major
//   _C: M x N output matrix, row-major; products are accumulated into it, so
//       the existing contents of _C are part of the result
//   _num_threads: requested worker count; values below 1 are treated as 1 and
//       the count is capped at M so no worker is created without rows
//
// Rows of C are split into contiguous, near-equal slices, one per worker.
// If bookkeeping memory cannot be allocated, or a thread cannot be created,
// the affected rows are computed on the calling thread instead, so the result
// is always complete when the function returns.
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;

  if (M <= 0) {
    num_threads = 0;
    return;  // no rows, nothing to compute
  }

  num_threads = (_num_threads > 0) ? _num_threads : 1;
  if (num_threads > M) {
    num_threads = M;
  }

  // Explicit casts on malloc keep this translation unit valid as C++ as well.
  pthread_t *threads =
      (pthread_t *)malloc(sizeof *threads * (size_t)num_threads);
  struct row_range *ranges =
      (struct row_range *)malloc(sizeof *ranges * (size_t)num_threads);
  if (threads == NULL || ranges == NULL) {
    fprintf(stderr, "mat_mul: out of memory; running single-threaded\n");
    free(threads);
    free(ranges);
    mat_mul_rows(0, M);
    return;
  }

  // Balanced partition: the first (M % num_threads) workers get one extra row
  // instead of the last worker absorbing the whole remainder.
  const int base = M / num_threads;
  const int extra = M % num_threads;

  int created = 0;  // number of valid handles stored in threads[0..created)
  int next_row = 0;
  for (int rank = 0; rank < num_threads; rank++) {
    ranges[rank].begin = next_row;
    ranges[rank].end = next_row + base + (rank < extra ? 1 : 0);
    next_row = ranges[rank].end;

    const int rc =
        pthread_create(&threads[created], NULL, mat_mul_thread, &ranges[rank]);
    if (rc == 0) {
      created++;
    } else {
      // Row slices are disjoint, so computing this one inline is safe while
      // the already-started workers keep running.
      fprintf(stderr,
              "mat_mul: pthread_create failed (%d); running slice inline\n",
              rc);
      mat_mul_rows(ranges[rank].begin, ranges[rank].end);
    }
  }

  for (int t = 0; t < created; t++) {
    pthread_join(threads[t], NULL);
  }

  free(ranges);
  free(threads);
}