chundoong-lab-ta/SamsungDS22/submissions/HW4/jb114.seo/mat_mul.cpp

151 lines
3.0 KiB
C++

#include "mat_mul.h"
#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include "util.h"
#define min(a,b) (((a) < (b)) ? (a) : (b))
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
/*
 * Tiled (blocked) matrix-multiply kernel: C += A * B on the
 * process-local matrices held in the file-scope statics
 * (A is M x K, B is K x N, C is M x N, row-major).
 * The caller must have zeroed C (or left a partial sum to extend).
 *
 * Parallelization: every thread executes the full ii/jj/kk tile loops;
 * the inner `omp for schedule(static) nowait` splits the i rows of the
 * current tile across the team.  The `nowait` is only safe because
 * schedule(static) on identical loop bounds assigns the SAME rows to
 * the SAME thread for every kk step of a given (ii, jj) tile, so no
 * two threads ever accumulate into the same row of C.
 */
static void mat_mul_omp()
{
#define BLOCKSIZE_K 32
#define BLOCKSIZE_I 1024
#define BLOCKSIZE_J 1024
#pragma omp parallel num_threads(num_threads) default(none) shared(A, B, C, M, N, K)
  for (int ii = 0; ii < M; ii += BLOCKSIZE_I)
  {
    // Hoisted out of the jj/kk loops: depends only on ii.
    const int endI = min(ii + BLOCKSIZE_I, M);
    for (int jj = 0; jj < N; jj += BLOCKSIZE_J)
    {
      // Hoisted out of the kk loop: depends only on jj.
      const int endJ = min(jj + BLOCKSIZE_J, N);
      for (int kk = 0; kk < K; kk += BLOCKSIZE_K)
      {
        const int endK = min(kk + BLOCKSIZE_K, K);
        // Distribute the tile's rows; see header comment for why
        // nowait + schedule(static) is race-free here.
#pragma omp for schedule(static) nowait
        for (int i = ii; i < endI; ++i)
        {
          float *pA = &A[i * K]; // row i of A
          float *pC = &C[i * N]; // row i of C
          for (int k = kk; k < endK; ++k)
          {
            float Aik = pA[k];
            float *pB = &B[k * N]; // row k of B
            for (int j = jj; j < endJ; ++j)
            {
              pC[j] += Aik * pB[j];
            }
          }
        }
      }
    }
  }
}
#define MM_ASYNC 1
/*
 * Distributed matrix multiply: _C = _A * _B, with _A sized _M x _K,
 * _B sized _K x _N and _C sized _M x _N (row-major; the caller's
 * pointers are meaningful on rank 0, other ranks receive data via MPI).
 *
 * Strategy:
 *   - B is broadcast whole to every rank.
 *   - Rows of A are scattered in equal chunks of _M / world_size.
 *   - Each rank runs mat_mul_omp() on its chunk; results are gathered
 *     back into _C on rank 0.
 *   - Leftover rows (when world_size does not divide _M) are computed
 *     by rank 0 alone, directly into the tails of _A/_C.
 * With MM_ASYNC == 1 the collectives are nonblocking so local setup
 * (allocation, zeroing) overlaps communication.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
int _num_threads, int _mpi_rank, int _mpi_world_size)
{
num_threads = _num_threads;
mpi_rank = _mpi_rank;
mpi_world_size = _mpi_world_size;
N = _N, K = _K;
#if MM_ASYNC == 1
MPI_Request reqs[3]; // [0] = A scatter, [1] = B bcast, [2] = C gather
#endif
// prepare B matrix: every rank needs all of B; only non-root ranks
// allocate — rank 0 reuses the caller's buffer directly.
if (mpi_rank != 0)
{
alloc_mat(&B, K, N);
}
else
{
B = _B;
}
#if MM_ASYNC == 1
MPI_Ibcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[1]);
#else
MPI_Bcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif
// partition: each rank owns M = _M / world_size rows; the remainder
// is handled by rank 0 in the "leftover clearance" step below.
M = _M / mpi_world_size;
// prepare local matrix
// NOTE(review): if _M < mpi_world_size this makes M == 0; presumably
// alloc_mat tolerates a zero-row allocation — verify against util.h.
alloc_mat(&A, M, K);
alloc_mat(&C, M, N);
// prepare A matrix
#if MM_ASYNC == 1
MPI_Iscatter(_A, M * K, MPI_FLOAT, A, M * K, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]);
#else
MPI_Scatter(_A, M * K, MPI_FLOAT, A, M * K, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif
// zerorize result matrix (mat_mul_omp accumulates into C)
zero_mat(C, M, N);
#if MM_ASYNC == 1
// Both input transfers must complete before the local multiply reads
// A and B.
MPI_Wait(&reqs[1], MPI_STATUS_IGNORE); // B
MPI_Wait(&reqs[0], MPI_STATUS_IGNORE); // A
#endif
// mul matrix
mat_mul_omp();
// collect result
#if MM_ASYNC == 1
MPI_Igather(C, M * N, MPI_FLOAT, _C, M * N, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[2]);
#else
MPI_Gather(C, M * N, MPI_FLOAT, _C, M * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif
// leftover clearance: rank 0 computes the _M % world_size rows that
// the equal-chunk scatter skipped, writing straight into the caller's
// buffers past the gathered region.  The static A/C/M are repointed
// so mat_mul_omp() can be reused unchanged.
float *tempA = A; // keep the local allocations so they can be freed
float *tempC = C; // after A/C are repointed below
if (mpi_rank == 0)
{
const int endM = M * mpi_world_size;
const int leftover = _M - endM;
if (leftover > 0)
{
M = leftover;
A = _A + (endM * K);
C = _C + (endM * N);
// printf("Clearing leftovers %d\n", leftover);
// NOTE(review): this writes the tail of _C while the Igather into _C
// is still pending.  The regions are disjoint, but the MPI standard
// formally forbids touching an active receive buffer before the
// operation completes — confirm with the MPI implementation, or move
// this block after the MPI_Wait on reqs[2].
mat_mul_omp();
}
}
#if MM_ASYNC == 1
MPI_Wait(&reqs[2], MPI_STATUS_IGNORE); // C
#endif
// release local matrix
// NOTE(review): assumes alloc_mat uses malloc-compatible allocation so
// plain free() is the right deallocator — verify against util.h.
free(tempA);
free(tempC);
if (mpi_rank != 0)
{
free(B);
}
}