chundoong-lab-ta/SamsungDS22/submissions/HW4/yw0.kim/mat_mul.cpp

#include "mat_mul.h"
#include "util.h"
#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <omp.h>
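
// Hybrid MPI + OpenMP matrix multiplication: rank 0 distributes contiguous
// row blocks of A (plus all of B) to the other ranks, every rank runs the
// tiled OpenMP kernel on its block, and rank 0 gathers the partial C results.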
// Set once non-root ranks have allocated their local A, B, C buffers.
static int isAlloc = 0;
static MPI_Status status;
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
static int proc_start, proc_end, num_rows;

// Tile sizes for the N and K loops and the unroll factor of the innermost N loop.
#define NTILE_SIZE 1024
#define KTILE_SIZE 32
#define UNROLL_SIZE 4
#define min(A, B) (((A) > (B)) ? (B) : (A))
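
// Per-node kernel. The local rows are split evenly across OpenMP threads; the
// K and N dimensions are walked in KTILE_SIZE x NTILE_SIZE tiles so the active
// parts of B and C stay cache-resident, and the innermost N loop is unrolled
// UNROLL_SIZE-wide. The kernel accumulates into C, so C must be zeroed first.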
static void mat_mul_omp()
{
  // Use num_threads threads per node; each thread owns a contiguous block of rows.
#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    // Balanced row partition: the first (num_rows % num_threads) threads get one extra row.
    int row_start = num_rows / num_threads * tid + min(tid, num_rows % num_threads);
    int row_end = num_rows / num_threads * (tid + 1) + min(tid + 1, num_rows % num_threads);
    int ks = KTILE_SIZE;
    int ns = NTILE_SIZE;
    for (int k_tile = 0; k_tile < K; k_tile += ks)
    {
      for (int n_tile = 0; n_tile < N; n_tile += ns)
      {
        for (int r = row_start; r < row_end; r++)
        {
          int k_limit = min(k_tile + ks, K);
          for (int k = k_tile; k < k_limit; k++)
          {
            float Ark = A[r * K + k];
            int unroll_size = UNROLL_SIZE;
            int n_limit = min(n_tile + ns, N);
            int n_unroll_limit = (n_limit / unroll_size) * unroll_size;
            // Unrolled walk over the body of the N tile.
            for (int n = n_tile; n < n_unroll_limit; n += unroll_size)
            {
#if UNROLL_SIZE >= 4
              C[r * N + n] += Ark * B[k * N + n];
              C[r * N + n + 1] += Ark * B[k * N + n + 1];
              C[r * N + n + 2] += Ark * B[k * N + n + 2];
              C[r * N + n + 3] += Ark * B[k * N + n + 3];
#endif
#if UNROLL_SIZE == 8
              C[r * N + n + 4] += Ark * B[k * N + n + 4];
              C[r * N + n + 5] += Ark * B[k * N + n + 5];
              C[r * N + n + 6] += Ark * B[k * N + n + 6];
              C[r * N + n + 7] += Ark * B[k * N + n + 7];
#endif
            }
            // Remainder of the N tile that does not fill a full unroll step.
            for (int n = n_unroll_limit; n < n_limit; n++)
            {
              C[r * N + n] += Ark * B[k * N + n];
            }
          }
        }
      }
    }
  }
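
  // Earlier blocked (i/j/k-tiled) variant, kept for reference; it relies on
  // ITILE_SIZE / JTILE_SIZE, which are no longer defined above.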
  // #pragma omp parallel
  // {
  //   int tid = omp_get_thread_num();
  //   int row_start = num_rows / num_threads * tid + min(tid, num_rows % num_threads);
  //   int row_end = num_rows / num_threads * (tid + 1) + min(tid + 1, num_rows % num_threads);
  //   int is = ITILE_SIZE;
  //   int js = JTILE_SIZE;
  //   int ks = KTILE_SIZE;
  //   #pragma omp parallel for shared(A,B,C) schedule(dynamic)
  //   for (int ii = row_start; ii < row_end; ii += is)
  //   {
  //     for (int jj = 0; jj < N; jj += js)
  //     {
  //       for (int kk = 0; kk < K; kk += ks)
  //       {
  //         int k_limit = min(kk + ks, K);
  //         for (int k = kk; k < k_limit; k++)
  //         {
  //           int i_limit = min(ii + is, row_end);
  //           for (int i = ii; i < i_limit; i++)
  //           {
  //             float Ar = A[i * K + k];
  //             int j_limit = min(jj + js, N);
  //             int unroll_size = UNROLL_SIZE;
  //             int unroll_limit = (j_limit / unroll_size) * unroll_size;
  //             // for (int j = jj; j < j_limit; j++)
  //             for (int j = jj; j < unroll_limit; j += unroll_size)
  //             {
  //               C[i * N + j] += Ar * B[k * N + j];
  //               C[i * N + j + 1] += Ar * B[k * N + j + 1];
  //               C[i * N + j + 2] += Ar * B[k * N + j + 2];
  //               C[i * N + j + 3] += Ar * B[k * N + j + 3];
  //             }
  //             for (int j = unroll_limit; j < j_limit; j++)
  //             {
  //               C[i * N + j] += Ar * B[k * N + j];
  //             }
  //           }
  //         }
  //       }
  //     }
  //   }
  // }
}
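
// Entry point, called on every rank. Rows of C are split across MPI ranks in
// near-equal contiguous blocks: rank p handles rows
// [M/P*p + min(p, M%P), M/P*(p+1) + min(p+1, M%P)) with P = mpi_world_size.
// Rank 0 ships each block of A together with all of B, every rank computes its
// block with mat_mul_omp(), and rank 0 collects the partial results back into C.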
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size)
{
  if (isAlloc == 0)
  {
    // Non-root ranks replace these with their own local buffers below.
    A = _A, B = _B, C = _C;
  }
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;
  // Non-root ranks must allocate and initialize their own A, B, C (done below).
  omp_set_num_threads(num_threads);

  if (mpi_world_size == 1)
  {
    // Single-node run: node #0 does everything.
    proc_start = 0;
    proc_end = M;
    num_rows = proc_end - proc_start;
    mat_mul_omp();
  }
  else
  {
    if (mpi_rank == 0)
    {
      for (int dest = 1; dest < mpi_world_size; dest++)
      {
        // Row range of C owned by rank `dest`.
        proc_start = M / mpi_world_size * dest + min(dest, M % mpi_world_size);
        proc_end = M / mpi_world_size * (dest + 1) + min(dest + 1, M % mpi_world_size);
        num_rows = proc_end - proc_start;
        // Send the row range to the other node.
        MPI_Send(&proc_start, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        MPI_Send(&proc_end, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        // Send that node's block of A and the whole B matrix (float data).
        MPI_Send(&A[proc_start * K], num_rows * K, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
        MPI_Send(B, K * N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
      }
      // Row range computed locally by rank 0.
      proc_start = M / mpi_world_size * mpi_rank + min(mpi_rank, M % mpi_world_size);
      proc_end = M / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, M % mpi_world_size);
      num_rows = proc_end - proc_start;
    }
    else
    {
      int source = 0;
      // Receive this rank's row range.
      MPI_Recv(&proc_start, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(&proc_end, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      num_rows = proc_end - proc_start;
      if (isAlloc == 0)
      {
        // Allocate local A, B, and C buffers on the first call only.
        alloc_mat(&A, num_rows, K);
        alloc_mat(&B, K, N);
        alloc_mat(&C, num_rows, N);
        isAlloc = 1;
      }
      // Zero the local C on every call, since the kernel accumulates into it.
      zero_mat(C, num_rows, N);
      // Receive this rank's block of A and the whole B matrix from node #0.
      MPI_Recv(A, num_rows * K, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(B, K * N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
    }
    // Multiply the local block.
    mat_mul_omp();
    if (mpi_rank == 0)
    {
      // Gather the partial C blocks from the other nodes.
      for (int source = 1; source < mpi_world_size; source++)
      {
        proc_start = M / mpi_world_size * source + min(source, M % mpi_world_size);
        proc_end = M / mpi_world_size * (source + 1) + min(source + 1, M % mpi_world_size);
        num_rows = proc_end - proc_start;
        MPI_Recv(&C[proc_start * N], num_rows * N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      }
    }
    else
    {
      // Send this node's partial result back to node #0.
      int dest = 0;
      MPI_Send(C, num_rows * N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
    }
  }
}