#include "mat_mul.h" #include #include #include #include static float *A, *B, *C; static int M, N, K; static int num_threads; static void* mat_mul_thread(void *data) { // TODO: parallelize & optimize matrix multiplication // int pid = * (int *) data; int slice = M / num_threads; int start = pid * slice; int end = (pid == num_threads-1)? M: (pid+1)*slice; float Aik; for (int kk=0; kk< K; kk+=32){ for(int i=start; i