#include "mat_mul.h" #include "util.h" #include #include #include #include static float *A, *B, *C; static int M, N, K; static int num_threads; static int mpi_rank, mpi_world_size; #define min(a,b) (((a) < (b)) ? (a) : (b)) #define MAX_NUM_OF_NODES (4) #define ITILESIZE (32) #define JTILESIZE (1024) #define KTILESIZE (1024) static void mat_mul_omp(int rows) { omp_set_num_threads(num_threads); #pragma omp parallel shared(A, B, C, rows, N, K, num_threads) { int tid = omp_get_thread_num(); int is = rows / num_threads * tid + min(tid, rows % num_threads); int ie = rows / num_threads * (tid + 1) + min(tid + 1, rows % num_threads); for (int ii = is; ii < ie; ii += ITILESIZE) { for (int jj = 0; jj < N; jj += JTILESIZE) { for (int kk = 0; kk < K; kk += KTILESIZE) { for (int k = kk; k < min(kk + KTILESIZE, K); k++) { for (int i = ii; i < min(ii + ITILESIZE, ie); i++) { float ar = A[i * K + k]; for (int j = jj; j < min(jj + JTILESIZE, N); j++) { C[i * N + j] += ar * B[k * N + j]; } } } } } } } return; } void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads, int _mpi_rank, int _mpi_world_size) { A = _A, B = _B, C = _C; M = _M, N = _N, K = _K; num_threads = _num_threads, mpi_rank = _mpi_rank, mpi_world_size = _mpi_world_size; int divided_rows[MAX_NUM_OF_NODES]; int offset[MAX_NUM_OF_NODES] = {0}; int divided_row, remainder, M_new; int tmp = 0; MPI_Status status; // TODO: parallelize & optimize matrix multiplication on multi-node // You must allocate & initialize A, B, C for non-root processes // FIXME: for now, only root process runs the matrix multiplication. if (mpi_rank == 0) { divided_row = M / mpi_world_size; remainder = M - divided_row * mpi_world_size; // Larger numbered nodes compute more rows if(remainder != 0) { for (int i = 0; i < (mpi_world_size - remainder); i++) { divided_rows[i] = divided_row; } for (int i = (mpi_world_size - remainder); i < mpi_world_size; i++) { divided_rows[i] = divided_row + 1; } } else { for (int i = 0; i < mpi_world_size; i++) { divided_rows[i] = divided_row; } } for (int i = 1; i < mpi_world_size; i++) { tmp += divided_rows[i - 1]; offset[i] = tmp; // Starting row number divided by node } // Send data to other nodes (tag = 0) for(int i = 1; i < mpi_world_size; i++) { MPI_Send(÷d_rows[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD); MPI_Send(&K, 1, MPI_INT, i, 0, MPI_COMM_WORLD); MPI_Send(&N, 1, MPI_INT, i, 0, MPI_COMM_WORLD); MPI_Send(&A[offset[i] * K], divided_rows[i] * K, MPI_FLOAT, i, 0, MPI_COMM_WORLD); MPI_Send(B, K * N, MPI_FLOAT, i, 0, MPI_COMM_WORLD); } // Calculate mat mul for root node part mat_mul_omp(divided_rows[0]); // Waiting until the each nodes sent their result (tag = 1) for(int i = 1; i < mpi_world_size; i++) { MPI_Recv(&C[offset[i] * N], divided_rows[i] * N, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &status); } } else { // Receive data from root node (tag = 0) MPI_Recv(&M_new, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status); MPI_Recv(&K, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status); MPI_Recv(&N, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status); // allocate for matrix alloc_mat(&A, M_new, K); alloc_mat(&B, K, N); alloc_mat(&C, M_new, N); // Receive divied A mat & B mat MPI_Recv(A, M_new * K, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status); MPI_Recv(B, K * N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status); mat_mul_omp(M_new); // Send result to root node (tag = 1) MPI_Send(C, M_new * N, MPI_FLOAT, 0, 1, MPI_COMM_WORLD); } }