// chundoong-lab-ta/SamsungDS22/submissions/HW4/seong81.kim/mat_mul.cpp

#include "mat_mul.h"
#include "util.h"

#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <omp.h>

// Tile extents (in elements) used by mat_mul_omp to block the i, j and k loops.
#define ITILESIZE (32)
#define JTILESIZE (1024)
#define KTILESIZE (1024)

// Base MPI message tags. mat_mul adds small offsets to these bases to tell
// apart the different messages travelling in each direction
// (rank 0 -> worker, worker -> rank 0).
#define  MASTER_TO_SLAVE_TAG 0
#define  SLAVE_TO_MASTER_TAG 10

// Operand/result matrices and their dimensions; assigned on entry to mat_mul.
// mat_mul_omp indexes them row-major: A as rows x K, B as K x N, C as rows x N.
static float *A, *B, *C;
static int M, N, K;
// Thread count handed to omp_set_num_threads in mat_mul_omp.
static int num_threads;
// Rank of this process and number of ranks, as passed to mat_mul.
static int mpi_rank, mpi_world_size;


// File-scope MPI bookkeeping objects (external linkage).
// Status objects that can receive the outcome of an MPI_Recv.
MPI_Status status0; // status slot for an MPI_Recv
MPI_Status status1; // status slot for an MPI_Recv
MPI_Status status2; // status slot for an MPI_Recv
// Request handles that can capture a non-blocking MPI_Isend.
MPI_Request request0; // request handle for an MPI_Isend
MPI_Request request1; // request handle for an MPI_Isend
MPI_Request request2; // request handle for an MPI_Isend

// Returns the smaller of a and b (returns a when they are equal).
// Internal linkage, matching the other file-local helper in this file.
static inline int func_min(int a, int b) {
    return (a > b) ? b : a;
}

// Tiled, OpenMP-parallel kernel: accumulates A * B into the first `high`
// rows of C, using the file-scope A, B, C, N, K and num_threads.
//
//   high  number of rows of A (and of C) to process, starting at row 0.
//
// C is updated with +=, so it must hold the desired initial values on entry.
// The outermost loop over row tiles is the parallel one, so every thread
// writes a disjoint set of rows of C.
static void mat_mul_omp(int high) {
    omp_set_num_threads(num_threads);

#pragma omp parallel for schedule(static)
    for (int ii = 0; ii < high; ii += ITILESIZE) {
        // Tile bounds are invariant for the loops they guard; compute once.
        const int i_end = func_min(high, ii + ITILESIZE);
        for (int jj = 0; jj < N; jj += JTILESIZE) {
            const int j_end = func_min(N, jj + JTILESIZE);
            for (int kk = 0; kk < K; kk += KTILESIZE) {
                const int k_end = func_min(K, kk + KTILESIZE);
                for (int k = kk; k < k_end; k++) {
                    // Row offsets are widened to size_t so that row * width
                    // cannot overflow int on large matrices.
                    const float *b_row = B + (size_t)k * N;
                    for (int i = ii; i < i_end; i++) {
                        const float a_ik = A[(size_t)i * K + k];
                        float *c_row = C + (size_t)i * N;
                        for (int j = jj; j < j_end; j++) {
                            c_row[j] += a_ik * b_row[j];
                        }
                    }
                }
            }
        }
    }
}

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
    A = _A, B = _B, C = _C;
    M = _M, N = _N, K = _K;
    num_threads = _num_threads, mpi_rank = _mpi_rank,
    mpi_world_size = _mpi_world_size;

    if (mpi_rank == 0) {
        int  i              ;
        int  M_div_node     ;
        int  real_num_node  ;
        int  node_unit      ;
        int  size           ;
        int  header[2]      ;
        int  A_base         ;
        int  C_base         ;
        int  low_M, high_M  ;

        //  Compute Real number of node which works to use
        M_div_node = M / mpi_world_size ;
        if (M_div_node == 0) {
            real_num_node = M ;
            node_unit     = 1 ;
        } else if (M_div_node * mpi_world_size < M){
            real_num_node = mpi_world_size  ;
            node_unit     = M_div_node + 1  ;
        } else {
            real_num_node = mpi_world_size  ;
            node_unit     = M_div_node      ;
        }

        //  Send index of divided A matrix
        for (i = 1 ; i < real_num_node; i++){
            low_M       =   i * node_unit                       ;
            high_M      =   func_min( (low_M + node_unit), M)   ;
            size        =   high_M - low_M                      ;
            header[0]   =   low_M                               ;
            header[1]   =   size                                ;
            A_base      =   low_M * K                           ;

            MPI_Isend(header,     2,        MPI_INT,   i, MASTER_TO_SLAVE_TAG,     MPI_COMM_WORLD, &request0);
            MPI_Isend(&A[A_base], size * K, MPI_FLOAT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &request1);
            MPI_Isend( B        , K    * N, MPI_FLOAT, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &request2);
        }
        for (i = real_num_node; i < mpi_world_size ; i++){
            header[0]   =   -100    ;
            header[1]   =   -100    ;
            MPI_Isend(header    , 2,        MPI_INT,   i, MASTER_TO_SLAVE_TAG + 3, MPI_COMM_WORLD, &request0);
        }

      //MPI_Bcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);


        //  Do multiplication dedicated to Rank0
        mat_mul_omp(func_min( node_unit, M));

        //  Receives multiplication from each worker
        for (i = 1; i < real_num_node; i++) {
            MPI_Recv(header,        2,        MPI_INT,   i, SLAVE_TO_MASTER_TAG    , MPI_COMM_WORLD, &status0);
            C_base  =   header[0] * N   ;
            size    =   header[1] * N   ;

            MPI_Recv(&C[C_base],  size, MPI_FLOAT, i, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &status1);
        }
    }
    else {
        int header[2];
        int size    ;

        MPI_Recv(header,        2,                    MPI_INT,   0, MASTER_TO_SLAVE_TAG,     MPI_COMM_WORLD, &status0);

        if (header[0] != -100) {
            size    =   header[1];

            alloc_mat(&A, size, K);
            MPI_Recv(A, size * K, MPI_FLOAT, 0, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &status1);

            alloc_mat(&B, K, N);
            MPI_Recv(B, K * N, MPI_FLOAT, 0, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &status2);
          //MPI_Bcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

            alloc_mat(&C, size, N);
            zero_mat(C, size, N);

          //timer_start(mpi_rank);
            mat_mul_omp(size);
          //double a=timer_stop(mpi_rank);
          //printf("\n  rank%d time = %f\n\n\n", mpi_rank, a);

            MPI_Isend(header,        2,         MPI_INT,   0, SLAVE_TO_MASTER_TAG,     MPI_COMM_WORLD, &request0) ;
            MPI_Isend(C    ,        size* N,   MPI_FLOAT, 0, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &request1) ;
        }
        else {
            MPI_Bcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
        }
    }
}