chundoong-lab-ta/SamsungDS22/submissions/HW4/ktr.kim/mat_mul.cpp

// vim:ts=2:sw=2:expandtab
#include "mat_mul.h"
#include "util.h"

#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <omp.h>

// Operand and result matrices, indexed row-major by mat_mul_omp():
// A[i * K + k], B[k * N + j], C[i * N + j].
static float *A;
static float *B;
static float *C;

// Problem dimensions, copied from the arguments of mat_mul().
static int M;
static int N;
static int K;

// Thread count handed to omp_set_num_threads() in mat_mul_omp().
static int num_threads;

// MPI rank of this process and total number of ranks, copied from the
// arguments of mat_mul().
static int mpi_rank;
static int mpi_world_size;

// Per-thread row share, set in mat_mul_omp().
static int tslice;

// Loop blocking factors along k, j and i; assigned in mat_mul_omp().
static int BK;
static int BJ;
static int BI;

// Number of rows of A (and of C) this process multiplies.
static int psize;

// Returns the smaller of two ints.
// Declared static, like the rest of this file's private state, so that the
// very generic name cannot collide with a `min` from another translation unit.
static inline int min(const int a, const int b) {
  return (b < a) ? b : a;
}


// Accumulates C += A * B for the rows owned by this process, using OpenMP.
//
// Inputs are the file-scope variables: A, B and C are indexed row-major
// (A[i * K + k], B[k * N + j], C[i * N + j]); psize is the number of rows
// of A and C to process; num_threads is the requested thread count.
// The result is added into C, so C must hold the desired starting values
// (normally zeros) on entry.
static void mat_mul_omp() {
  omp_set_num_threads(num_threads);

  // Blocking factors for the i (rows), j (columns) and k (depth) loops.
  BI = 32;
  BJ = 1024;
  BK = 1024;

#pragma omp parallel
  {
    // Partition by the team size the runtime actually granted. It may be
    // smaller than the requested num_threads, and splitting by the requested
    // count would then leave the rows of the missing threads uncomputed.
    const int team = omp_get_num_threads();
    const int tid = omp_get_thread_num();

    // Spread the rows evenly: the first `extra` threads take one more row,
    // so no single thread ends up with the whole remainder.
    const int base = psize / team;
    const int extra = psize % team;
    const int tstart = tid * base + min(tid, extra);
    const int tend = tstart + base + (tid < extra ? 1 : 0);

    // Keep the file-scope slice size up to date; only one thread writes it
    // and no thread reads it inside this region.
    if (tid == 0) {
      tslice = base;
    }

    const int bi = BI;
    const int bj = BJ;
    const int bk = BK;

    for (int ii = tstart; ii < tend; ii += bi) {
      const int i_end = min(ii + bi, tend);
      for (int jj = 0; jj < N; jj += bj) {
        const int j_end = min(jj + bj, N);
        for (int kk = 0; kk < K; kk += bk) {
          const int k_end = min(kk + bk, K);
          for (int k = kk; k < k_end; ++k) {
            // Row offsets are computed in size_t so large matrices do not
            // overflow int arithmetic.
            const float *b_row = B + (size_t) k * N;
            for (int i = ii; i < i_end; ++i) {
              const float a_ik = A[(size_t) i * K + k];
              float *c_row = C + (size_t) i * N;
              // Innermost loop walks contiguous memory in both B and C.
              for (int j = jj; j < j_end; ++j) {
                c_row[j] += a_ik * b_row[j];
              }
            }
          }
        }
      }
    }
  }
}

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
    int _num_threads, int _mpi_rank, int _mpi_world_size) {
  // A = _A,
  // B = _B;
  C = _C;
  M = _M; N = _N; K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
              mpi_world_size = _mpi_world_size;

  // TODO: parallelize & optimize matrix multiplication on multi-node
  // You must allocate & initialize A, B, C for non-root processes

  MPI_Request requests[3];
  int pshare; // # of shares per process
  int *pstart; // start index of a process
  int *pend; // end index of a process
  int mSizeA; // sending matrix size of A
  int mSizeB = K * N; // sending matrix size of B
  int tag = 1236;

  MPI_Status status;
  // printf("\n");
  pshare = (int) (M / mpi_world_size);

  if(mpi_rank == 0)
  {
    B = _B;
  }
  else
  {
    alloc_mat(&B, K, N);
  }
  MPI_Bcast(B, mSizeB, MPI_FLOAT, 0, MPI_COMM_WORLD);

  if(mpi_rank == 0) // master
  {
    pstart = (int *) malloc(mpi_world_size * sizeof(int));
    pend = (int *) malloc(mpi_world_size * sizeof(int));

    for(int ii=1; ii<mpi_world_size; ++ii)
    {
      //========================================================================
      // size information
      //========================================================================
      pstart[ii] = ii * pshare;
      pend[ii] = (ii == mpi_world_size-1) ? M : pstart[ii] + pshare;
      psize = pend[ii] - pstart[ii];
      mSizeA = K * (pend[ii] - pstart[ii]);

      //========================================================================
      // send matrices
      //========================================================================
      MPI_Send(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);
      // MPI_Send(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);
      // MPI_Send(&_B[0], mSizeB, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);
      // MPI_Isend(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);
      MPI_Isend(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT,
          ii, tag, MPI_COMM_WORLD, &requests[ii-1]);
      // MPI_Isend(&_B[0], mSizeB, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);
      // MPI_Wait(&requests[ii-1], &status);
    }

    //==========================================================================
    // matrix multiplication
    //==========================================================================
    A = _A;
    // B = _B;
    psize = pshare;

    // printf("(info, M) matrix A\n");
    // print_mat(A, psize, K);
    mat_mul_omp();

    // printf("(info, M) matrix C\n");
    // print_mat(C, K, N);

    //==========================================================================
    // merge matrix
    //==========================================================================
    for(int ii=1; ii<mpi_world_size; ++ii)
    {
      MPI_Recv(&C[pstart[ii] * N], (pend[ii] - pstart[ii]) * N,
          MPI_FLOAT, ii, tag, MPI_COMM_WORLD, &status);
    }

    // printf("(info, M) matrix C\n");
    // print_mat(C, K, N);
  }
  else
  {
    //==========================================================================
    // recieve parameters
    //==========================================================================
    MPI_Recv(&psize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);

    //==========================================================================
    // recieve matrix A
    //==========================================================================
    // A = (float *) malloc( psize * K * sizeof(float));
    alloc_mat(&A, psize, K);
    MPI_Recv(&A[0], psize * K, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);
    // printf("(info, S) %d node\n", mpi_rank);
    // print_mat(A, psize, K);
    // printf("(info, S%d) matrix A\n", mpi_rank);
    // print_mat(A, psize, K);

    //==========================================================================
    // recieve matrix B
    //==========================================================================
    // B = (float *) malloc ( K * N * sizeof(float));
    // alloc_mat(&B, K, N);
    // MPI_Recv(&B[0], K * N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);
    // printf("(info, S%d) matrix B\n", mpi_rank);
    // print_mat(B, K, N);

    //==========================================================================
    // matrix multiplication
    //==========================================================================
    // C = (float *) malloc ( psize * N * sizeof(float));
    alloc_mat(&C, psize, N);
    mat_mul_omp();
    // printf("(info, S%d) matrix C\n", mpi_rank);
    // print_mat(C, psize, N);

    //==========================================================================
    // send matrix
    //==========================================================================
    // MPI_Send(C, psize*N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD);
    MPI_Isend(C, psize*N, MPI_FLOAT, 0, tag,
        MPI_COMM_WORLD, &requests[mpi_rank-1]);
  }

  return;
}