#include "mat_mul.h"
#include
#include
#include
#include

/* Operand/result matrix pointers and their dimensions (file-local state). */
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;

#define FROM_MASTER 1
#define FROM_WORKER 2

#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))
#endif

#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif

/*
 * Allocates a 32-byte-aligned buffer large enough for an R x C float matrix
 * and stores it in *m.
 *
 * The byte count is rounded up to a multiple of the alignment because
 * aligned_alloc() is only guaranteed to succeed for such sizes.
 * On failure an error is written to stderr and the process terminates with
 * EXIT_FAILURE; the function never returns with *m == NULL.
 */
void alloc_mat_local(float **m, int R, int C) {
  const size_t alignment = 32;
  size_t bytes = sizeof(float) * (size_t)R * (size_t)C;

  /* Round up so the request is an integral multiple of the alignment. */
  bytes = (bytes + alignment - 1) / alignment * alignment;

  *m = (float *)aligned_alloc(alignment, bytes);
  if (*m == NULL) {
    fprintf(stderr, "Failed to allocate memory for matrix.\n");
    exit(EXIT_FAILURE);
  }
}

/* Sets all R * C floats of the buffer m to zero bytes. */
void zero_mat_local(float *m, int R, int C) {
  memset(m, 0, sizeof(float) * R * C);
}

/*
 * Multiplication kernel over `rows` rows beginning at row `offset`.
 * Inside the OpenMP parallel region every thread derives its own
 * [start, end) row range from its thread index.
 */
static void mat_mul_omp(int offset, int rows) {
  // TODO: parallelize & optimize matrix multiplication
  // Use num_threads per node
  #pragma omp parallel
  {
    int idx = omp_get_thread_num();
    /* NOTE(review): the split uses the file-scope num_threads; this assumes
     * the parallel region runs with exactly that many threads - confirm. */
    int slice = rows / num_threads;
    int start = offset + (idx * slice);
    /* The last thread also takes the rows left over by the integer division. */
    int end = idx == num_threads - 1 ? offset+rows : offset + (idx + 1) * slice;
    float Aik;
    /* Presumably tile sizes for the i, j and k loops - TODO confirm. */
    int iBS = 32;
    int jBS = 1024;
    int kBS = 1024;
    /* NOTE(review): the kk loop advances by iBS, not kBS - confirm intended. */
    for (int kk = 0; kk < K; kk += iBS) {
      for (int jj = 0; jj < N; jj += jBS) {
        for(int ii=start;ii