chundoong-lab-ta/SamsungDS22/submissions/HW4/jaehyun5.kim/mat_mul.cpp

#include "mat_mul.h"
#include "util.h"

#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <omp.h>

static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
static int offset,rows;

/* Return the smaller of the two integers (either one when they are equal). */
int min(int a, int b)
{
	if (a < b) {
		return a;
	}
	return b;
}

/*
 * Accumulate C += A * B over the first `rows` rows held by this process,
 * using a team of up to `num_threads` OpenMP threads.
 *
 * Reads the file-scope A (row stride K), B (row stride N), K, N, rows and
 * num_threads, and adds into the file-scope C (row stride N). Because the
 * result is accumulated with `+=`, C must already contain its starting
 * values when this function is called.
 *
 * The k dimension is walked in blocks of `bs` so that the same `bs` rows of
 * B are reused for every row of A before moving on to the next block.
 */
static void mat_mul_omp() {
  // Number of consecutive k indices (rows of B) processed per block.
  const int bs = 32;

  // Every loop variable is declared inside the parallel region so each
  // thread gets its own copy; sharing them between threads would be a data
  // race on the loop counters themselves.
#pragma omp parallel num_threads(num_threads)
  {
    for (int kk = 0; kk < K; kk += bs) {
      // Upper bound of this k-block, hoisted out of the loop condition.
      const int k_end = min(kk + bs, K);

      // Rows are split across the team by the runtime. Each row of C is
      // written by exactly one thread within a block, and the barrier at the
      // end of the worksharing loop keeps successive blocks from overlapping.
#pragma omp for schedule(static)
      for (int i = 0; i < rows; ++i) {
        // size_t offsets avoid int overflow of i * K / i * N on big matrices.
        const float *a_row = A + (size_t)i * K;
        float *c_row = C + (size_t)i * N;

        for (int k = kk; k < k_end; ++k) {
          const float a_ik = a_row[k];
          const float *b_row = B + (size_t)k * N;

          for (int j = 0; j < N; ++j) {
            c_row[j] += a_ik * b_row[j];
          }
        }
      }
    }
  }
}

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;
  MPI_Status status;
  MPI_Request request;

  // TODO: parallelize & optimize matrix multiplication on multi-node
  // You must allocate & initialize A, B, C for non-root processes
  //
  
  if(mpi_rank==0){
	  int row_size = M / mpi_world_size;
	  int st, ed;

	  for(int node =1;node<mpi_world_size;node++){
		  st = offset = node * row_size;
		  ed = node==mpi_world_size-1 ?M : (node+1)*row_size;
		  rows = ed-st;

		  MPI_Isend(&offset, 1, MPI_INT, node, 1, MPI_COMM_WORLD, &request);
		  MPI_Isend(&rows, 1, MPI_INT, node, 1, MPI_COMM_WORLD, &request);
		  MPI_Isend(&A[offset*K], rows*K, MPI_FLOAT, node, 1, MPI_COMM_WORLD, &request);
		  MPI_Isend(B, K*N, MPI_FLOAT, node, 1, MPI_COMM_WORLD, &request);
	  }
	  rows = row_size;
	  mat_mul_omp();

	  for(int node =1;node<mpi_world_size;node++){
		  MPI_Recv(&offset, 1, MPI_INT, node, 2, MPI_COMM_WORLD, &status);
		  MPI_Recv(&rows, 1, MPI_INT, node, 2, MPI_COMM_WORLD, &status);
		  MPI_Recv(&C[offset*N], rows*N, MPI_FLOAT, node, 2, MPI_COMM_WORLD, &status);
	  }
  }
  else{
	  alloc_mat(&A, M, K);
	  alloc_mat(&B, K, N);
	  alloc_mat(&C, M, N);
	  zero_mat(C, M, N);

	  MPI_Recv(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
	  MPI_Recv(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
	  MPI_Recv(A, rows*K, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &status);
	  MPI_Recv(B, K*N, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &status);

	  mat_mul_omp();

	  MPI_Isend(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &request);
	  MPI_Isend(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &request);
	  MPI_Isend(C, rows*N, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &request);
  }

}
. 2022-09-29 18:01:45 +09:00			`#include "mat_mul.h"`
			`#include "util.h"`

			`#include <cstdio>`
			`#include <cstdlib>`
			`#include <mpi.h>`
			`#include <omp.h>`

			`static float *A, *B, *C;`
			`static int M, N, K;`
			`static int num_threads;`
			`static int mpi_rank, mpi_world_size;`
			`static int offset,rows;`

			`int min(int a,int b)`
			`{`
			`return a<b?a:b;`
			`}`

			`static void mat_mul_omp() {`
			`// TODO: parallelize & optimize matrix multiplication`
			`// Use num_threads per node`
			`int index, start, end;`

			`int offset = M / (mpi_world_size) / num_threads;`
			`int kk, i, k, j;`

			`float A_calc;`
			`int bs = 32;`

			`#pragma omp parallel num_threads(num_threads) //private(index, offset, start, end)`
			`{`
			`index = omp_get_thread_num();`
			`start = index * offset;`
			`end = (index== num_threads -1) ? rows : (index+1) * offset;`
			`for (kk = 0; kk < K; kk += bs) {`
			`for (i = start; i < end; ++i) {`
			`for (k = kk; k < min(kk + bs, K); k++) {`
			`A_calc = A[i * K + k];`
			`for(j=0;j<N;j++){`
			`C[i * N + j] += A_calc * B[k * N + j];`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`

			`void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,`
			`int _num_threads, int _mpi_rank, int _mpi_world_size) {`
			`A = _A, B = _B, C = _C;`
			`M = _M, N = _N, K = _K;`
			`num_threads = _num_threads, mpi_rank = _mpi_rank,`
			`mpi_world_size = _mpi_world_size;`
			`MPI_Status status;`
			`MPI_Request request;`

			`// TODO: parallelize & optimize matrix multiplication on multi-node`
			`// You must allocate & initialize A, B, C for non-root processes`
			`//`

			`if(mpi_rank==0){`
			`int row_size = M / mpi_world_size;`
			`int st, ed;`

			`for(int node =1;node<mpi_world_size;node++){`
			`st = offset = node * row_size;`
			`ed = node==mpi_world_size-1 ?M : (node+1)*row_size;`
			`rows = ed-st;`

			`MPI_Isend(&offset, 1, MPI_INT, node, 1, MPI_COMM_WORLD, &request);`
			`MPI_Isend(&rows, 1, MPI_INT, node, 1, MPI_COMM_WORLD, &request);`
			`MPI_Isend(&A[offset*K], rows*K, MPI_FLOAT, node, 1, MPI_COMM_WORLD, &request);`
			`MPI_Isend(B, K*N, MPI_FLOAT, node, 1, MPI_COMM_WORLD, &request);`
			`}`
			`rows = row_size;`
			`mat_mul_omp();`

			`for(int node =1;node<mpi_world_size;node++){`
			`MPI_Recv(&offset, 1, MPI_INT, node, 2, MPI_COMM_WORLD, &status);`
			`MPI_Recv(&rows, 1, MPI_INT, node, 2, MPI_COMM_WORLD, &status);`
			`MPI_Recv(&C[offset*N], rows*N, MPI_FLOAT, node, 2, MPI_COMM_WORLD, &status);`
			`}`
			`}`
			`else{`
			`alloc_mat(&A, M, K);`
			`alloc_mat(&B, K, N);`
			`alloc_mat(&C, M, N);`
			`zero_mat(C, M, N);`

			`MPI_Recv(&offset, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);`
			`MPI_Recv(&rows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);`
			`MPI_Recv(A, rows*K, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &status);`
			`MPI_Recv(B, K*N, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &status);`

			`mat_mul_omp();`

			`MPI_Isend(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &request);`
			`MPI_Isend(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &request);`
			`MPI_Isend(C, rows*N, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &request);`
			`}`

			`}`