/*
 * Tiled matrix multiplication kernel, parallelized with OpenMP.
 *
 * File-scope state declared below:
 *   TILE_M, TILE_K, TILE_N   Tile sizes (64, 16, 2048). TILE_M is the step of
 *                            the outer ii loop and TILE_K the step of the kk
 *                            loop in mat_mul_omp(); TILE_N is not used in the
 *                            portion of the file visible to this review.
 *   A, B, C                  float buffers (presumably the two inputs and the
 *                            output matrix -- TODO confirm against the rest
 *                            of the file).
 *   M, N, K                  Integer dimensions; K bounds the kk loop.
 *   num_threads              Passed to the OpenMP num_threads() clause.
 *   mpi_rank, mpi_world_size Integer rank/size values; mpi_rank indexes rows[].
 *   rows[4], offset[4]       Zero-initialized tables of four ints.
 *                            rows[mpi_rank] is the exclusive upper bound of
 *                            the ii loop, so mpi_rank must stay in 0..3 or
 *                            the read is out of bounds.
 *
 * The MASTER / FROM_MASTER / FROM_WORKER macros are commented out.
 *
 * NOTE(review): this text appears to have lost its line breaks, and the three
 * bare "#include" directives have lost their header names. Restore both from
 * the original file before building.
 */
#include "mat_mul.h" #include "util.h" #include #include #include //#define MASTER 0 //#define FROM_MASTER 1 //#define FROM_WORKER 2 #define TILE_M 64 #define TILE_K 16 #define TILE_N 2048 static float *A, *B, *C; static int M, N, K; static int num_threads; static int mpi_rank, mpi_world_size; static int rows[4] = {0,}; static int offset[4] = {0,}; static void mat_mul_omp() { int begin = 0; int end = rows[mpi_rank]; // TODO: parallelize & optimize matrix multiplication // Use num_threads per node #pragma omp parallel for num_threads(num_threads) schedule(dynamic) for (int ii = begin; ii < end; ii+= TILE_M) { for (int kk = 0; kk < K; kk+= TILE_K) { for (int jj = 0; jj