#include "mat_mul.h" #include #include #include #define my_MASTER 0 #define FROM_MASTER 1 #define FROM_WORKER 2 #define ITILESIZE 32 #define JTILESIZE 1024 #define KTILESIZE 1024 static float *A, *B, *C; static int M, N, K; static int num_threads; static int mpi_rank, mpi_world_size; static void mat_mul_omp(int tid, int my_M) { int is = my_M / num_threads * tid + std::min(tid, my_M % num_threads); int ie = my_M / num_threads * (tid + 1) + std::min(tid + 1, my_M % num_threads); for(int ii = is; ii < ie; ii += ITILESIZE){ for(int jj = 0; jj < N; jj += JTILESIZE){ for(int kk = 0; kk < K; kk += KTILESIZE){ for(int k = kk; k < std::min(K, kk + KTILESIZE); k++){ for(int i = ii; i < std::min(ie, ii + ITILESIZE); i++){ float ar = A[i * K + k]; for(int j = jj; j < std::min(N, jj + JTILESIZE); j++){ C[i * N + j] += ar * B[k * N + j]; } } } } } } return; } void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads, int _mpi_rank, int _mpi_world_size) { A = _A, B = _B, C = _C; M = _M, N = _N, K = _K; num_threads = _num_threads; // Threads per nodes mpi_rank = _mpi_rank; mpi_world_size = _mpi_world_size; int numworkers, source, dest, mtype, averow, extra, offset; int my_M; int *my_offset; int *my_rows; MPI_Status status; MPI_Request request = MPI_REQUEST_NULL; my_offset = (int *)malloc(sizeof(int) * mpi_world_size); my_rows = (int *)malloc(sizeof(int) * mpi_world_size); for(int i=0; i 1){ numworkers = mpi_world_size; averow = M / numworkers; extra = M % numworkers; offset = 0; my_M = (mpi_rank==numworkers-1)?(averow+extra):(averow); if(mpi_rank == my_MASTER){ offset = averow; mtype = FROM_MASTER; for(dest=1; dest