#include "mat_mul.h"
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <mpi.h>
|
|
#include "util.h"
|
|
|
|
#define min(a,b) (((a) < (b)) ? (a) : (b))
|
|
|
|
static float *A, *B, *C;
|
|
static int M, N, K;
|
|
static int num_threads;
|
|
static int mpi_rank, mpi_world_size;
|
|
|
|
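// Blocked (tiled) matrix multiplication over this rank's local slice:
// C (M x N) += A (M x K) * B (K x N), tiled with BLOCKSIZE_I/J/K to improve
// cache reuse.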
static void mat_mul_omp()
{
#define BLOCKSIZE_K 32
#define BLOCKSIZE_I 1024
#define BLOCKSIZE_J 1024

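// All threads enter the parallel region and execute the tiling loops
// redundantly; only the row loop marked with "omp for" below is actually
// divided between threads.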
#pragma omp parallel num_threads(num_threads) default(none) shared(A, B, C, M, N, K)
  for (int ii = 0; ii < M; ii += BLOCKSIZE_I)
  {
    for (int jj = 0; jj < N; jj += BLOCKSIZE_J)
    {
      for (int kk = 0; kk < K; kk += BLOCKSIZE_K)
      {
        const int endI = min(ii + BLOCKSIZE_I, M);
        const int endJ = min(jj + BLOCKSIZE_J, N);
        const int endK = min(kk + BLOCKSIZE_K, K);

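        // schedule(static) assigns the same rows to the same thread on every
        // (jj, kk) pass (identical bounds, identical schedule), so no thread
        // ever writes another thread's rows of C and the nowait is race-free.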
#pragma omp for schedule(static) nowait
        for (int i = ii; i < endI; ++i)
        {
          float *pA = &A[i * K];
          float *pC = &C[i * N];

          for (int k = kk; k < endK; ++k)
          {
            float Aik = pA[k];
            float *pB = &B[k * N];

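            // i-k-j order: Aik is a scalar here and both B and C are walked
            // with unit stride, which keeps the inner loop cache-friendly and
            // easy to vectorize.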
            for (int j = jj; j < endJ; ++j)
            {
              pC[j] += Aik * pB[j];
            }
          }
        }
      }
    }
  }
}

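// MM_ASYNC == 1 uses the non-blocking collectives (MPI_Ibcast / MPI_Iscatter /
// MPI_Igather) so communication can overlap with local setup and compute;
// set it to 0 to fall back to the blocking variants.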
#define MM_ASYNC 1

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size)
{
  num_threads = _num_threads;
  mpi_rank = _mpi_rank;
  mpi_world_size = _mpi_world_size;
  N = _N;
  K = _K;

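  // Request handles for the non-blocking collectives:
  // reqs[0] = scatter of A, reqs[1] = broadcast of B, reqs[2] = gather of C.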
#if MM_ASYNC == 1
  MPI_Request reqs[3];
#endif

  // prepare B matrix: every rank needs all of B, so non-root ranks allocate a
  // K x N receive buffer while rank 0 broadcasts its _B in place
  if (mpi_rank != 0)
  {
    alloc_mat(&B, K, N);
  }
  else
  {
    B = _B;
  }

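  // Broadcast B to every rank; in the async path this proceeds in the
  // background while the local A and C buffers are prepared below.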
#if MM_ASYNC == 1
  MPI_Ibcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[1]);
#else
  MPI_Bcast(B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif

  // partition: each rank handles _M / mpi_world_size rows; rank 0 handles any
  // remaining rows itself later
  M = _M / mpi_world_size;

  // prepare local matrices
  alloc_mat(&A, M, K);
  alloc_mat(&C, M, N);

  // prepare A matrix: scatter one M x K row block to each rank
#if MM_ASYNC == 1
  MPI_Iscatter(_A, M * K, MPI_FLOAT, A, M * K, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]);
#else
  MPI_Scatter(_A, M * K, MPI_FLOAT, A, M * K, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif

  // zero the local result block (the kernel accumulates into C)
  zero_mat(C, M, N);

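  // The broadcast of B and the scatter of A must both have completed before
  // the local multiplication can start.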
#if MM_ASYNC == 1
  MPI_Wait(&reqs[1], MPI_STATUS_IGNORE); // B
  MPI_Wait(&reqs[0], MPI_STATUS_IGNORE); // A
#endif

  // multiply the local row block: C = A * B
  mat_mul_omp();

  // collect the per-rank result blocks on rank 0; in the async path the gather
  // overlaps with the leftover computation below
#if MM_ASYNC == 1
  MPI_Igather(C, M * N, MPI_FLOAT, _C, M * N, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[2]);
#else
  MPI_Gather(C, M * N, MPI_FLOAT, _C, M * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif

  // leftover clearance: when _M is not evenly divisible by the world size,
  // rank 0 computes the remaining rows directly from _A into _C (the kernel
  // accumulates, so this relies on those rows of _C starting out zeroed).
  // The local buffers are stashed first because A, C and M are repointed.
  float *tempA = A;
  float *tempC = C;
  if (mpi_rank == 0)
  {
    const int endM = M * mpi_world_size;
    const int leftover = _M - endM;

    if (leftover > 0)
    {
      M = leftover;
      A = _A + (endM * K);
      C = _C + (endM * N);

      // printf("Clearing leftovers %d\n", leftover);

      mat_mul_omp();
    }
  }

#if MM_ASYNC == 1
  MPI_Wait(&reqs[2], MPI_STATUS_IGNORE); // C
#endif

  // release the local buffers via the stashed pointers (A and C may have been
  // repointed above)
  free(tempA);
  free(tempC);

  if (mpi_rank != 0)
  {
    free(B);
  }
}