// chundoong-lab-ta/SamsungDS22/submissions/HW4/dk2003.lim/mat_mul.cpp

#include "mat_mul.h"

#include <cstdio>
#include <cstdlib>
#include <mpi.h>

static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;


/*
static void mat_mul_omp() {
  // TODO: parallelize & optimize matrix multiplication
  // Use num_threads per node
#pragma omp parallel for
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      for (int k = 0; k < K; ++k) {
        C[i * N + j] += A[i * K + k] * B[k * N + j];
      }
    }
  }
}
*/

#include "util.h"
#define MAX_RANKS	4
#define min(x,y) (x) < (y) ? (x) : (y)

// Since it is very small size data, I store rows and offset, no need to send
int rows_rank[MAX_RANKS], offset_rank[MAX_RANKS];

/*
 * Tiled, OpenMP-parallel kernel: accumulates C += A * B for the rows owned
 * by the calling rank.
 *
 * Reads the file-level state set up by mat_mul(): A, B, C, N, K,
 * num_threads, mpi_rank and rows_rank[]. Only rows [0, rows_rank[mpi_rank])
 * of the local A and C buffers are touched. C is accumulated into, so it
 * must hold the desired initial values (normally zero) on entry.
 */
static void mat_mul_omp() {
	const int ITILESIZE = 32;
	const int JTILESIZE = 1024;
	const int KTILESIZE = 1024;

	// Number of rows this rank is responsible for.
	const int ie = rows_rank[mpi_rank];

	// Author's measurements: ~470 GFLOPS without the num_threads clause,
	// 320 ~ 350 GFLOPS with the thread count limited as below.
	#pragma omp parallel for schedule(dynamic) num_threads(num_threads)
	for (int ii = 0; ii < ie; ii += ITILESIZE) {
		// Clamp the row tile to this rank's slice (ie), not to M: on the root
		// rank M is the full row count, and clamping to it would let the last
		// tile spill into rows that belong to the next rank.
		const int min_ii = ((ii + ITILESIZE) < ie) ? (ii + ITILESIZE) : ie;
		for (int jj = 0; jj < N; jj += JTILESIZE) {
			const int min_jj = ((jj + JTILESIZE) < N) ? (jj + JTILESIZE) : N;
			for (int kk = 0; kk < K; kk += KTILESIZE) {
				const int min_kk = ((kk + KTILESIZE) < K) ? (kk + KTILESIZE) : K;

				// k-i-j order inside a tile: the innermost loop walks a row of B
				// and a row of C contiguously while A[i][k] stays in a register.
				for (int k = kk; k < min_kk; k++) {
					for (int i = ii; i < min_ii; i++) {
						const float Aik = A[i * K + k];
						for (int j = jj; j < min_jj; j++) {
							C[i * N + j] += Aik * B[k * N + j];
						}
					}
				}
			}
		}
	}
}


void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;

  // TODO: parallelize & optimize matrix multiplication on multi-node
  // You must allocate & initialize A, B, C for non-root processes

  // FIXME: for now, only root process runs the matrix multiplication.

  MPI_Request request;
  MPI_Status	status;

	int working_ranks = mpi_world_size;
	int row_space = M / working_ranks;
	int last_row_space = row_space + M % row_space;

	// Calculate global value rows_rank and offset_rank for the row of matrix A.
	for(int i=0; i < working_ranks; i++) {
	    //rows_rank[i] = (i== working_ranks - 1) ? (M - (row_space * (working_ranks -1))) : row_space;
	    rows_rank[i] = (i== working_ranks - 1) ? last_row_space : row_space;
			offset_rank[i+1] = offset_rank[i] + rows_rank[i];
	}


	// Matrix allocation for rank 1, 2, 3, ... since there is no Matrix allocation.
	if(mpi_rank != 0) {
			M = rows_rank[mpi_rank];		// Updated the size of the row of matrix A accoring to the rank
			alloc_mat(&A, rows_rank[mpi_rank], K);
			alloc_mat(&B, K, N);
			alloc_mat(&C, rows_rank[mpi_rank], N);
	}

	// Broadcast B since it is wholely used in every working rank.
	MPI_Bcast(B, K*N, MPI_FLOAT, 0, MPI_COMM_WORLD);

	if(mpi_rank == 0) {
	  for(int i=1; i < working_ranks; i++)
			MPI_Isend(&A[offset_rank[i]*K], rows_rank[i] * K, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &request);
	}
	else {
			MPI_Recv(A, rows_rank[mpi_rank] * K, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
	}

	// Matrix multiplication for each rank according to its size
	mat_mul_omp();


	// Gather the whole results
	if(mpi_rank == 0) {
			for(int i=1; i < working_ranks; i++)
					MPI_Recv(&C[offset_rank[i]*N], rows_rank[i]*N, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &status);
	}
	else {
			MPI_Isend(C, rows_rank[mpi_rank] * N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request);
	}


//	MPI_Finalize();
//	free(A);
//	free(B);
//	free(C);

}