chundoong-lab-ta/SamsungDS22/submissions/HW4/yw0.kim/mat_mul.cpp

#include "mat_mul.h"
#include "util.h"
#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <omp.h>
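
// Hybrid MPI + OpenMP matrix multiplication: rank 0 distributes contiguous
// row blocks of A (plus all of B) to the other ranks, every rank runs the
// tiled OpenMP kernel on its block, and rank 0 gathers the partial C results.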
// Set once non-root ranks have allocated their local A, B, C buffers.
static int isAlloc = 0;
static MPI_Status status;
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
static int proc_start, proc_end, num_rows;

// Tile sizes for the N and K loops and the unroll factor of the innermost N loop.
#define NTILE_SIZE 1024
#define KTILE_SIZE 32
#define UNROLL_SIZE 4
#define min(A, B) (((A) > (B)) ? (B) : (A))
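
// Per-node kernel. The local rows are split evenly across OpenMP threads; the
// K and N dimensions are walked in KTILE_SIZE x NTILE_SIZE tiles so the active
// parts of B and C stay cache-resident, and the innermost N loop is unrolled
// UNROLL_SIZE-wide. The kernel accumulates into C, so C must be zeroed first.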
static void mat_mul_omp()
{
  // Use num_threads threads per node; each thread owns a contiguous block of rows.
#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    // Balanced row partition: the first (num_rows % num_threads) threads get one extra row.
    int row_start = num_rows / num_threads * tid + min(tid, num_rows % num_threads);
    int row_end = num_rows / num_threads * (tid + 1) + min(tid + 1, num_rows % num_threads);
    int ks = KTILE_SIZE;
    int ns = NTILE_SIZE;
    for (int k_tile = 0; k_tile < K; k_tile += ks)
    {
      for (int n_tile = 0; n_tile < N; n_tile += ns)
      {
        for (int r = row_start; r < row_end; r++)
        {
          int k_limit = min(k_tile + ks, K);
          for (int k = k_tile; k < k_limit; k++)
          {
            float Ark = A[r * K + k];
            int unroll_size = UNROLL_SIZE;
            int n_limit = min(n_tile + ns, N);
            int n_unroll_limit = (n_limit / unroll_size) * unroll_size;
            // Unrolled walk over the body of the N tile.
            for (int n = n_tile; n < n_unroll_limit; n += unroll_size)
            {
#if UNROLL_SIZE >= 4
              C[r * N + n] += Ark * B[k * N + n];
              C[r * N + n + 1] += Ark * B[k * N + n + 1];
              C[r * N + n + 2] += Ark * B[k * N + n + 2];
              C[r * N + n + 3] += Ark * B[k * N + n + 3];
#endif
#if UNROLL_SIZE == 8
              C[r * N + n + 4] += Ark * B[k * N + n + 4];
              C[r * N + n + 5] += Ark * B[k * N + n + 5];
              C[r * N + n + 6] += Ark * B[k * N + n + 6];
              C[r * N + n + 7] += Ark * B[k * N + n + 7];
#endif
            }
            // Remainder of the N tile that does not fill a full unroll step.
            for (int n = n_unroll_limit; n < n_limit; n++)
            {
              C[r * N + n] += Ark * B[k * N + n];
            }
          }
        }
      }
    }
  }
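
  // Earlier blocked (i/j/k-tiled) variant, kept for reference; it relies on
  // ITILE_SIZE / JTILE_SIZE, which are no longer defined above.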
  // #pragma omp parallel
  // {
  //   int tid = omp_get_thread_num();
  //   int row_start = num_rows / num_threads * tid + min(tid, num_rows % num_threads);
  //   int row_end = num_rows / num_threads * (tid + 1) + min(tid + 1, num_rows % num_threads);
  //   int is = ITILE_SIZE;
  //   int js = JTILE_SIZE;
  //   int ks = KTILE_SIZE;
  //   #pragma omp parallel for shared(A,B,C) schedule(dynamic)
  //   for (int ii = row_start; ii < row_end; ii += is)
  //   {
  //     for (int jj = 0; jj < N; jj += js)
  //     {
  //       for (int kk = 0; kk < K; kk += ks)
  //       {
  //         int k_limit = min(kk + ks, K);
  //         for (int k = kk; k < k_limit; k++)
  //         {
  //           int i_limit = min(ii + is, row_end);
  //           for (int i = ii; i < i_limit; i++)
  //           {
  //             float Ar = A[i * K + k];
  //             int j_limit = min(jj + js, N);
  //             int unroll_size = UNROLL_SIZE;
  //             int unroll_limit = (j_limit / unroll_size) * unroll_size;
  //             // for (int j = jj; j < j_limit; j++)
  //             for (int j = jj; j < unroll_limit; j += unroll_size)
  //             {
  //               C[i * N + j] += Ar * B[k * N + j];
  //               C[i * N + j + 1] += Ar * B[k * N + j + 1];
  //               C[i * N + j + 2] += Ar * B[k * N + j + 2];
  //               C[i * N + j + 3] += Ar * B[k * N + j + 3];
  //             }
  //             for (int j = unroll_limit; j < j_limit; j++)
  //             {
  //               C[i * N + j] += Ar * B[k * N + j];
  //             }
  //           }
  //         }
  //       }
  //     }
  //   }
  // }
}
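
// Entry point, called on every rank. Rows of C are split across MPI ranks in
// near-equal contiguous blocks: rank p handles rows
// [M/P*p + min(p, M%P), M/P*(p+1) + min(p+1, M%P)) with P = mpi_world_size.
// Rank 0 ships each block of A together with all of B, every rank computes its
// block with mat_mul_omp(), and rank 0 collects the partial results back into C.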
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size)
{
  if (isAlloc == 0)
  {
    // Non-root ranks replace these with their own local buffers below.
    A = _A, B = _B, C = _C;
  }
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;
  // Non-root ranks must allocate and initialize their own A, B, C (done below).
  omp_set_num_threads(num_threads);

  if (mpi_world_size == 1)
  {
    // Single-node run: node #0 does everything.
    proc_start = 0;
    proc_end = M;
    num_rows = proc_end - proc_start;
    mat_mul_omp();
  }
  else
  {
    if (mpi_rank == 0)
    {
      for (int dest = 1; dest < mpi_world_size; dest++)
      {
        // Row range of C owned by rank `dest`.
        proc_start = M / mpi_world_size * dest + min(dest, M % mpi_world_size);
        proc_end = M / mpi_world_size * (dest + 1) + min(dest + 1, M % mpi_world_size);
        num_rows = proc_end - proc_start;
        // Send the row range to the other node.
        MPI_Send(&proc_start, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        MPI_Send(&proc_end, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        // Send that node's block of A and the whole B matrix (float data).
        MPI_Send(&A[proc_start * K], num_rows * K, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
        MPI_Send(B, K * N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
      }
      // Row range computed locally by rank 0.
      proc_start = M / mpi_world_size * mpi_rank + min(mpi_rank, M % mpi_world_size);
      proc_end = M / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, M % mpi_world_size);
      num_rows = proc_end - proc_start;
    }
    else
    {
      int source = 0;
      // Receive this rank's row range.
      MPI_Recv(&proc_start, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(&proc_end, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      num_rows = proc_end - proc_start;
      if (isAlloc == 0)
      {
        // Allocate local A, B, and C buffers on the first call only.
        alloc_mat(&A, num_rows, K);
        alloc_mat(&B, K, N);
        alloc_mat(&C, num_rows, N);
        isAlloc = 1;
      }
      // Zero the local C on every call, since the kernel accumulates into it.
      zero_mat(C, num_rows, N);
      // Receive this rank's block of A and the whole B matrix from node #0.
      MPI_Recv(A, num_rows * K, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(B, K * N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
    }
    // Multiply the local block.
    mat_mul_omp();
    if (mpi_rank == 0)
    {
      // Gather the partial C blocks from the other nodes.
      for (int source = 1; source < mpi_world_size; source++)
      {
        proc_start = M / mpi_world_size * source + min(source, M % mpi_world_size);
        proc_end = M / mpi_world_size * (source + 1) + min(source + 1, M % mpi_world_size);
        num_rows = proc_end - proc_start;
        MPI_Recv(&C[proc_start * N], num_rows * N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      }
    }
    else
    {
      // Send this node's partial result back to node #0.
      int dest = 0;
      MPI_Send(C, num_rows * N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
    }
  }
}