#include "mat_mul.h"
#include "omp.h"
#include "util.h"

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* Smaller of two values. Arguments may be evaluated twice, so pass only
 * side-effect-free expressions. */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))

/* Tile extents used by mat_mul_omp() for the i, j and k loops. */
#define ITILESIZE (20)
#define JTILESIZE (1024)
#define KTILESIZE (1024)

/* Message tags: work travelling root -> worker, results worker -> root. */
#define TAG_WORK (1)
#define TAG_RESULT (2)

/* Operands of the local multiplication. On rank 0 these alias the caller's
 * buffers; on workers they are buffers obtained from alloc_mat(). */
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;

/* Number of rows of A/C handled by this rank and the index of the first of
 * those rows in the full matrix. */
static int rows, offset;

/* Both objects have external linkage in the original file, so they are kept
 * in case another translation unit refers to them. `request` is no longer
 * used by this file. */
MPI_Status status;
MPI_Request request;

/*
 * Accumulates A * B into C for the `rows` rows owned by this rank.
 *
 * A is indexed as rows x K, B as K x N and C as rows x N, all row-major.
 * C is updated with `+=`, so it must hold the desired initial value on entry.
 * The outermost (row-tile) loop is shared among OpenMP threads; each thread
 * therefore writes a disjoint set of rows of C.
 */
static void mat_mul_omp(void) {
#pragma omp parallel for
  for (int ii = 0; ii < rows; ii += ITILESIZE) {
    const int i_end = MIN(ii + ITILESIZE, rows);
    for (int jj = 0; jj < N; jj += JTILESIZE) {
      const int j_end = MIN(jj + JTILESIZE, N);
      for (int kk = 0; kk < K; kk += KTILESIZE) {
        const int k_end = MIN(kk + KTILESIZE, K);
        for (int k = kk; k < k_end; k++) {
          for (int i = ii; i < i_end; i++) {
            const float ar = A[i * K + k];
            for (int j = jj; j < j_end; j++) {
              C[i * N + j] += ar * B[k * N + j];
            }
          }
        }
      }
    }
  }
}

/*
 * Rank 0: hands a slice of A plus all of B to every worker, multiplies its
 * own slice, then gathers the workers' slices of C.
 *
 * Each worker gets M / world_size rows; rank 0 additionally takes the
 * M % world_size leftover rows, which sit at the top of the matrix.
 */
static void run_root(void) {
  const int workers = mpi_world_size - 1;
  const int remain = M % mpi_world_size;
  /* Not const-qualified: pre-MPI-3 headers declare send buffers as void *. */
  int worker_rows = M / mpi_world_size;

  MPI_Request *reqs = NULL;
  int *offsets = NULL;
  int nreq = 0;

  if (workers > 0) {
    /* Casts keep the file buildable as either C or C++. */
    reqs = (MPI_Request *)malloc((size_t)workers * 4 * sizeof *reqs);
    offsets = (int *)malloc((size_t)workers * sizeof *offsets);
    if (reqs == NULL || offsets == NULL) {
      fprintf(stderr, "mat_mul: out of memory while preparing sends\n");
      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
  }

  /* A non-blocking send may read its buffer at any time until it completes,
   * so every destination gets its own offset slot and its own requests, and
   * none of the buffers is modified before MPI_Waitall() below. */
  for (int dest = 1; dest <= workers; dest++) {
    int *off = &offsets[dest - 1];
    *off = remain + dest * worker_rows;

    MPI_Isend(off, 1, MPI_INT, dest, TAG_WORK, MPI_COMM_WORLD, &reqs[nreq++]);
    MPI_Isend(&worker_rows, 1, MPI_INT, dest, TAG_WORK, MPI_COMM_WORLD,
              &reqs[nreq++]);
    MPI_Isend(&A[*off * K], worker_rows * K, MPI_FLOAT, dest, TAG_WORK,
              MPI_COMM_WORLD, &reqs[nreq++]);
    MPI_Isend(B, K * N, MPI_FLOAT, dest, TAG_WORK, MPI_COMM_WORLD,
              &reqs[nreq++]);
  }

  /* Rank 0 computes rows [0, worker_rows + remain) while the sends are
   * outstanding.
   * NOTE(review): C is accumulated into without being cleared here;
   * presumably the caller zeroes it - confirm. */
  offset = 0;
  rows = worker_rows + remain;
  mat_mul_omp();

  if (nreq > 0) {
    MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
  }
  free(offsets);
  free(reqs);

  /* Collect each worker's slice straight into its place in C. */
  for (int source = 1; source <= workers; source++) {
    MPI_Recv(&offset, 1, MPI_INT, source, TAG_RESULT, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, source, TAG_RESULT, MPI_COMM_WORLD, &status);
    MPI_Recv(&C[offset * N], rows * N, MPI_FLOAT, source, TAG_RESULT,
             MPI_COMM_WORLD, &status);
  }
}

/*
 * Worker rank: receives its slice description and operands from rank 0,
 * multiplies, and sends the resulting rows of C back.
 */
static void run_worker(void) {
  MPI_Recv(&offset, 1, MPI_INT, 0, TAG_WORK, MPI_COMM_WORLD, &status);
  MPI_Recv(&rows, 1, MPI_INT, 0, TAG_WORK, MPI_COMM_WORLD, &status);

  /* NOTE(review): these buffers are allocated on every call and never
   * released; the matching deallocator for alloc_mat() is not visible from
   * this file, so none is called here. */
  alloc_mat(&A, rows, K);
  alloc_mat(&B, K, N);
  alloc_mat(&C, rows, N);
  zero_mat(C, rows, N);

  MPI_Recv(A, rows * K, MPI_FLOAT, 0, TAG_WORK, MPI_COMM_WORLD, &status);
  MPI_Recv(B, K * N, MPI_FLOAT, 0, TAG_WORK, MPI_COMM_WORLD, &status);

  mat_mul_omp();

  /* Blocking sends: the buffers are guaranteed reusable on return, so the
   * next call may safely receive into `offset` and `rows` again. */
  MPI_Send(&offset, 1, MPI_INT, 0, TAG_RESULT, MPI_COMM_WORLD);
  MPI_Send(&rows, 1, MPI_INT, 0, TAG_RESULT, MPI_COMM_WORLD);
  MPI_Send(C, rows * N, MPI_FLOAT, 0, TAG_RESULT, MPI_COMM_WORLD);
}

/*
 * Distributed matrix multiplication: C (M x N) accumulates A (M x K) times
 * B (K x N), with rows of A and C divided among the MPI ranks.
 *
 * _A, _B, _C        matrices in row-major order; only read on rank 0, where
 *                   the full result is gathered into _C.
 * _M, _N, _K        matrix dimensions, used on every rank.
 * _num_threads      OpenMP thread count for the local computation.
 * _mpi_rank         rank of the calling process; rank 0 is the coordinator.
 * _mpi_world_size   total number of ranks taking part.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  M = _M;
  N = _N;
  K = _K;
  num_threads = _num_threads;
  mpi_rank = _mpi_rank;
  mpi_world_size = _mpi_world_size;

  omp_set_num_threads(num_threads);

  if (mpi_rank == 0) {
    A = _A;
    B = _B;
    C = _C;
    run_root();
  } else if (mpi_rank > 0) {
    run_worker();
  }
}