#include "mat_mul.h" #include #include #include #include static float *A, *B, *C; static int M, N, K; static int num_threads; static void* mat_mul_thread(void *data) { // TODO: parallelize & optimize matrix multiplication int pid = *(int *)data; int slice = M/num_threads; int start = pid*slice; int end = pid == num_threads -1 ? M: (pid+1)*slice; float Aik; int bs=32; if(start>=M) return NULL; for(int kprim=0;kprim