#include "mat_mul.h" #include #include #include #include #include #include #include #include #include "util.h" static float *A, *B, *C; static int M, N, K; static int num_threads; static int mpi_rank, mpi_world_size; static int rows[4] = {0,}; static int offset[4] = {0,}; int omp_get_thread_num(void); int omp_get_num_threads(void); static void mat_mul_omp() { // TODO: parallelize & optimize matrix multiplication // Use num_threads per node int start = 0; int end = rows[mpi_rank]; int TILE_M = 40, TILE_K =16, TILE_N =4096; int end_m, end_k, end_n; float temp; #pragma omp parallel for num_threads(num_threads) schedule(static) for(int ii=start; ii