chundoong-lab-ta/SamsungDS22/submissions/HW4/ktr.kim/mat_mul.cpp

// vim:ts=2:sw=2:expandtab
#include "mat_mul.h"
#include "util.h"

#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <omp.h>

static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
static int tslice;
static int BK;
static int BJ;
static int BI;
static int psize; // size of a process

// Returns the smaller of the two integers; when they compare equal, `a` is
// returned.
inline int min(const int a, const int b) {
  if (a > b) {
    return b;
  }
  return a;
}


static void mat_mul_omp() {
  // TODO: parallelize & optimize matrix multiplication
  // Use num_threads per node
  omp_set_num_threads(num_threads);

  tslice = (int) (psize / num_threads);
  BI = 32;
  BJ = 1024;
  BK = 1024;

#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int tstart = tslice * tid;
    int tend = (tid ==(num_threads-1)) ? psize : tstart + tslice;

    // printf("(tid, tstart, tend) = (%d, %d, %d)\n", tid, tstart, tend);

    float Aik;
    for(int ii=tstart; ii<tend; ii+=BI)
    {
      for(int jj=0; jj<N; jj+=BJ)
      {
        for(int kk=0; kk<K; kk+=BK)
        {
          for (int k=kk; k<min(kk+BK, K); ++k)
          {
            for (int i=ii; i < min(ii+BI, tend); ++i)
            {
              Aik = A[i*K+k];
              // printf("A[%d,%d] = %f\n", i, k, Aik); 
              for (int j=jj; j<min(jj+BJ,N); ++j)
              {
                // printf("B[%d,%d] = %f\n", k, j, B[k*N+j]); 
                C[i * N + j] += Aik * B[k * N + j];
                // printf("C[%d,%d] = %f\n", i, j, C[i*N+j]); 
              }
            }
          }
        }
      }
    }
  }
}

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
    int _num_threads, int _mpi_rank, int _mpi_world_size) {
  // A = _A,
  // B = _B;
  C = _C;
  M = _M; N = _N; K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
              mpi_world_size = _mpi_world_size;

  // TODO: parallelize & optimize matrix multiplication on multi-node
  // You must allocate & initialize A, B, C for non-root processes

  MPI_Request requests[3];
  int pshare; // # of shares per process
  int *pstart; // start index of a process
  int *pend; // end index of a process
  int mSizeA; // sending matrix size of A
  int mSizeB = K * N; // sending matrix size of B
  int tag = 1236;

  MPI_Status status;
  // printf("\n");
  pshare = (int) (M / mpi_world_size);

  if(mpi_rank == 0)
  {
    B = _B;
  }
  else
  {
    alloc_mat(&B, K, N);
  }
  MPI_Bcast(B, mSizeB, MPI_FLOAT, 0, MPI_COMM_WORLD);

  if(mpi_rank == 0) // master
  {
    pstart = (int *) malloc(mpi_world_size * sizeof(int));
    pend = (int *) malloc(mpi_world_size * sizeof(int));

    for(int ii=1; ii<mpi_world_size; ++ii)
    {
      //========================================================================
      // size information
      //========================================================================
      pstart[ii] = ii * pshare;
      pend[ii] = (ii == mpi_world_size-1) ? M : pstart[ii] + pshare;
      psize = pend[ii] - pstart[ii];
      mSizeA = K * (pend[ii] - pstart[ii]);

      //========================================================================
      // send matrices
      //========================================================================
      MPI_Send(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);
      // MPI_Send(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);
      // MPI_Send(&_B[0], mSizeB, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);
      // MPI_Isend(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);
      MPI_Isend(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT,
          ii, tag, MPI_COMM_WORLD, &requests[ii-1]);
      // MPI_Isend(&_B[0], mSizeB, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);
      // MPI_Wait(&requests[ii-1], &status);
    }

    //==========================================================================
    // matrix multiplication
    //==========================================================================
    A = _A;
    // B = _B;
    psize = pshare;

    // printf("(info, M) matrix A\n");
    // print_mat(A, psize, K);
    mat_mul_omp();

    // printf("(info, M) matrix C\n");
    // print_mat(C, K, N);

    //==========================================================================
    // merge matrix
    //==========================================================================
    for(int ii=1; ii<mpi_world_size; ++ii)
    {
      MPI_Recv(&C[pstart[ii] * N], (pend[ii] - pstart[ii]) * N,
          MPI_FLOAT, ii, tag, MPI_COMM_WORLD, &status);
    }

    // printf("(info, M) matrix C\n");
    // print_mat(C, K, N);
  }
  else
  {
    //==========================================================================
    // recieve parameters
    //==========================================================================
    MPI_Recv(&psize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);

    //==========================================================================
    // recieve matrix A
    //==========================================================================
    // A = (float *) malloc( psize * K * sizeof(float));
    alloc_mat(&A, psize, K);
    MPI_Recv(&A[0], psize * K, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);
    // printf("(info, S) %d node\n", mpi_rank);
    // print_mat(A, psize, K);
    // printf("(info, S%d) matrix A\n", mpi_rank);
    // print_mat(A, psize, K);

    //==========================================================================
    // recieve matrix B
    //==========================================================================
    // B = (float *) malloc ( K * N * sizeof(float));
    // alloc_mat(&B, K, N);
    // MPI_Recv(&B[0], K * N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);
    // printf("(info, S%d) matrix B\n", mpi_rank);
    // print_mat(B, K, N);

    //==========================================================================
    // matrix multiplication
    //==========================================================================
    // C = (float *) malloc ( psize * N * sizeof(float));
    alloc_mat(&C, psize, N);
    mat_mul_omp();
    // printf("(info, S%d) matrix C\n", mpi_rank);
    // print_mat(C, psize, N);

    //==========================================================================
    // send matrix
    //==========================================================================
    // MPI_Send(C, psize*N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD);
    MPI_Isend(C, psize*N, MPI_FLOAT, 0, tag,
        MPI_COMM_WORLD, &requests[mpi_rank-1]);
  }

  return;
}
. 2022-09-29 18:01:45 +09:00			`// vim:ts=2:sw=2:expandtab`
			`#include "mat_mul.h"`
			`#include "util.h"`

			`#include <cstdio>`
			`#include <cstdlib>`
			`#include <mpi.h>`
			`#include <omp.h>`

			`static float *A, *B, *C;`
			`static int M, N, K;`
			`static int num_threads;`
			`static int mpi_rank, mpi_world_size;`
			`static int tslice;`
			`static int BK;`
			`static int BJ;`
			`static int BI;`
			`static int psize; // size of a process`

			`inline int min(const int a, const int b){return a>b ? b : a;}`


			`static void mat_mul_omp() {`
			`// TODO: parallelize & optimize matrix multiplication`
			`// Use num_threads per node`
			`omp_set_num_threads(num_threads);`

			`tslice = (int) (psize / num_threads);`
			`BI = 32;`
			`BJ = 1024;`
			`BK = 1024;`

			`#pragma omp parallel`
			`{`
			`int tid = omp_get_thread_num();`
			`int tstart = tslice * tid;`
			`int tend = (tid ==(num_threads-1)) ? psize : tstart + tslice;`

			`// printf("(tid, tstart, tend) = (%d, %d, %d)\n", tid, tstart, tend);`

			`float Aik;`
			`for(int ii=tstart; ii<tend; ii+=BI)`
			`{`
			`for(int jj=0; jj<N; jj+=BJ)`
			`{`
			`for(int kk=0; kk<K; kk+=BK)`
			`{`
			`for (int k=kk; k<min(kk+BK, K); ++k)`
			`{`
			`for (int i=ii; i < min(ii+BI, tend); ++i)`
			`{`
			`Aik = A[i*K+k];`
			`// printf("A[%d,%d] = %f\n", i, k, Aik);`
			`for (int j=jj; j<min(jj+BJ,N); ++j)`
			`{`
			`// printf("B[%d,%d] = %f\n", k, j, B[k*N+j]);`
			`C[i * N + j] += Aik * B[k * N + j];`
			`// printf("C[%d,%d] = %f\n", i, j, C[i*N+j]);`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`

			`void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,`
			`int _num_threads, int _mpi_rank, int _mpi_world_size) {`
			`// A = _A,`
			`// B = _B;`
			`C = _C;`
			`M = _M; N = _N; K = _K;`
			`num_threads = _num_threads, mpi_rank = _mpi_rank,`
			`mpi_world_size = _mpi_world_size;`

			`// TODO: parallelize & optimize matrix multiplication on multi-node`
			`// You must allocate & initialize A, B, C for non-root processes`

			`MPI_Request requests[3];`
			`int pshare; // # of shares per process`
			`int *pstart; // start index of a process`
			`int *pend; // end index of a process`
			`int mSizeA; // sending matrix size of A`
			`int mSizeB = K * N; // sending matrix size of B`
			`int tag = 1236;`

			`MPI_Status status;`
			`// printf("\n");`
			`pshare = (int) (M / mpi_world_size);`

			`if(mpi_rank == 0)`
			`{`
			`B = _B;`
			`}`
			`else`
			`{`
			`alloc_mat(&B, K, N);`
			`}`
			`MPI_Bcast(B, mSizeB, MPI_FLOAT, 0, MPI_COMM_WORLD);`

			`if(mpi_rank == 0) // master`
			`{`
			`pstart = (int *) malloc(mpi_world_size * sizeof(int));`
			`pend = (int *) malloc(mpi_world_size * sizeof(int));`

			`for(int ii=1; ii<mpi_world_size; ++ii)`
			`{`
			`//========================================================================`
			`// size information`
			`//========================================================================`
			`pstart[ii] = ii * pshare;`
			`pend[ii] = (ii == mpi_world_size-1) ? M : pstart[ii] + pshare;`
			`psize = pend[ii] - pstart[ii];`
			`mSizeA = K * (pend[ii] - pstart[ii]);`

			`//========================================================================`
			`// send matrices`
			`//========================================================================`
			`MPI_Send(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);`
			`// MPI_Send(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);`
			`// MPI_Send(&_B[0], mSizeB, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);`
			`// MPI_Isend(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);`
			`MPI_Isend(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT,`
			`ii, tag, MPI_COMM_WORLD, &requests[ii-1]);`
			`// MPI_Isend(&_B[0], mSizeB, MPI_FLOAT, ii, tag, MPI_COMM_WORLD);`
			`// MPI_Wait(&requests[ii-1], &status);`
			`}`

			`//==========================================================================`
			`// matrix multiplication`
			`//==========================================================================`
			`A = _A;`
			`// B = _B;`
			`psize = pshare;`

			`// printf("(info, M) matrix A\n");`
			`// print_mat(A, psize, K);`
			`mat_mul_omp();`

			`// printf("(info, M) matrix C\n");`
			`// print_mat(C, K, N);`

			`//==========================================================================`
			`// merge matrix`
			`//==========================================================================`
			`for(int ii=1; ii<mpi_world_size; ++ii)`
			`{`
			`MPI_Recv(&C[pstart[ii] * N], (pend[ii] - pstart[ii]) * N,`
			`MPI_FLOAT, ii, tag, MPI_COMM_WORLD, &status);`
			`}`

			`// printf("(info, M) matrix C\n");`
			`// print_mat(C, K, N);`
			`}`
			`else`
			`{`
			`//==========================================================================`
			`// recieve parameters`
			`//==========================================================================`
			`MPI_Recv(&psize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);`

			`//==========================================================================`
			`// recieve matrix A`
			`//==========================================================================`
			`// A = (float ) malloc( psize K * sizeof(float));`
			`alloc_mat(&A, psize, K);`
			`MPI_Recv(&A[0], psize * K, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);`
			`// printf("(info, S) %d node\n", mpi_rank);`
			`// print_mat(A, psize, K);`
			`// printf("(info, S%d) matrix A\n", mpi_rank);`
			`// print_mat(A, psize, K);`

			`//==========================================================================`
			`// recieve matrix B`
			`//==========================================================================`
			`// B = (float ) malloc ( K N * sizeof(float));`
			`// alloc_mat(&B, K, N);`
			`// MPI_Recv(&B[0], K * N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);`
			`// printf("(info, S%d) matrix B\n", mpi_rank);`
			`// print_mat(B, K, N);`

			`//==========================================================================`
			`// matrix multiplication`
			`//==========================================================================`
			`// C = (float ) malloc ( psize N * sizeof(float));`
			`alloc_mat(&C, psize, N);`
			`mat_mul_omp();`
			`// printf("(info, S%d) matrix C\n", mpi_rank);`
			`// print_mat(C, psize, N);`

			`//==========================================================================`
			`// send matrix`
			`//==========================================================================`
			`// MPI_Send(C, psize*N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD);`
			`MPI_Isend(C, psize*N, MPI_FLOAT, 0, tag,`
			`MPI_COMM_WORLD, &requests[mpi_rank-1]);`
			`}`

			`return;`
			`}`