#include "mat_mul.h"
|
|
#include "util.h"
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <mpi.h>
|
|
#include <omp.h>
|
|
|
|
static int isAlloc = 0;
|
|
static MPI_Request request;
|
|
static MPI_Status status;
|
|
static float *A, *B, *C;
|
|
static int M, N, K;
|
|
static int num_threads;
|
|
static int mpi_rank, mpi_world_size;
|
|
static int proc_start, proc_end, num_rows;
|
|
|
|
#define NTILE_SIZE 1024
|
|
#define KTILE_SIZE 32
|
|
|
|
#define UNROLL_SIZE 4
|
|
#define min(A, B) (((A) > (B)) ? (B) : (A))
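
// mat_mul_omp: multiplies this process's block of rows of A by B and
// accumulates into C. The row block is split evenly across OpenMP threads,
// the K and N loops are tiled (KTILE_SIZE x NTILE_SIZE) for cache reuse,
// and the innermost loop over N is manually unrolled by UNROLL_SIZE.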
static void mat_mul_omp()
{
  // TODO: parallelize & optimize matrix multiplication
  // Use num_threads per node

  #pragma omp parallel
  {
    int tid = omp_get_thread_num();
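    // Split num_rows rows as evenly as possible across num_threads threads;
    // the first (num_rows % num_threads) threads get one extra row.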
    int row_start = num_rows / num_threads * tid + min(tid, num_rows % num_threads);
    int row_end = num_rows / num_threads * (tid + 1) + min(tid + 1, num_rows % num_threads);
    int ks = KTILE_SIZE;
    int ns = NTILE_SIZE;
    float Ark = 0.0;

    // The rows are already divided among threads above, so each thread runs
    // the full tile loops over its own [row_start, row_end) block.
    for (int k_tile = 0; k_tile < K; k_tile += ks)
    {
      for (int n_tile = 0; n_tile < N; n_tile += ns)
      {
        for (int r = row_start; r < row_end; r++)
        {
          int k_limit = min(k_tile + ks, K);
          for (int k = k_tile; k < k_limit; k++)
          {
            Ark = A[r * K + k];
            int unroll_size = UNROLL_SIZE;
            int n_limit = min(n_tile + ns, N);
            int n_unroll_limit = (n_limit / unroll_size) * unroll_size;
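
            // Innermost loop over columns, manually unrolled by UNROLL_SIZE;
            // the A element Ark is reused across the whole row of B.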
            for (int n = n_tile; n < n_unroll_limit; n += unroll_size)
            {
#if UNROLL_SIZE >= 4
              C[r * N + n] += Ark * B[k * N + n];
              C[r * N + n + 1] += Ark * B[k * N + n + 1];
              C[r * N + n + 2] += Ark * B[k * N + n + 2];
              C[r * N + n + 3] += Ark * B[k * N + n + 3];
#endif
#if UNROLL_SIZE == 8
              C[r * N + n + 4] += Ark * B[k * N + n + 4];
              C[r * N + n + 5] += Ark * B[k * N + n + 5];
              C[r * N + n + 6] += Ark * B[k * N + n + 6];
              C[r * N + n + 7] += Ark * B[k * N + n + 7];
#endif
            }
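
            // Remainder loop for columns that do not fill a full unroll group.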
            for (int n = n_unroll_limit; n < n_limit; n++)
            {
              C[r * N + n] += Ark * B[k * N + n];
            }
          }
        }
      }
    }
  }
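
  // Alternative I/J/K-tiled version (ITILE_SIZE / JTILE_SIZE are not defined
  // above), kept commented out for reference: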
  // #pragma omp parallel
  // {
  //   int tid = omp_get_thread_num();
  //   int row_start = num_rows / num_threads * tid + min(tid, num_rows % num_threads);
  //   int row_end = num_rows / num_threads * (tid + 1) + min(tid + 1, num_rows % num_threads);
  //   int is = ITILE_SIZE;
  //   int js = JTILE_SIZE;
  //   int ks = KTILE_SIZE;
  //
  //   #pragma omp parallel for shared(A,B,C) schedule(dynamic)
  //   for (int ii = row_start; ii < row_end; ii += is)
  //   {
  //     for (int jj = 0; jj < N; jj += js)
  //     {
  //       for (int kk = 0; kk < K; kk += ks)
  //       {
  //         int k_limit = min(kk + ks, K);
  //         for (int k = kk; k < k_limit; k++)
  //         {
  //           int i_limit = min(ii + is, row_end);
  //           for (int i = ii; i < i_limit; i++)
  //           {
  //             float Ar = A[i * K + k];
  //             int j_limit = min(jj + js, N);
  //             int unroll_size = UNROLL_SIZE;
  //             int unroll_limit = (j_limit / unroll_size) * unroll_size;
  //             // for (int j = jj; j < j_limit; j++)
  //             for (int j = jj; j < unroll_limit; j += unroll_size)
  //             {
  //               C[i * N + j] += Ar * B[k * N + j];
  //               C[i * N + j + 1] += Ar * B[k * N + j + 1];
  //               C[i * N + j + 2] += Ar * B[k * N + j + 2];
  //               C[i * N + j + 3] += Ar * B[k * N + j + 3];
  //             }
  //
  //             for (int j = unroll_limit; j < j_limit; j++)
  //             {
  //               C[i * N + j] += Ar * B[k * N + j];
  //             }
  //           }
  //         }
  //       }
  //     }
  //   }
  // }
}
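
// mat_mul: multi-node entry point. Rank 0 owns the full A, B, and C; it sends
// each other rank its block of rows of A plus the whole B matrix, every rank
// runs mat_mul_omp() on its block, and rank 0 gathers the partial C results.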
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size)
{
  if (isAlloc == 0)
  {
    A = _A, B = _B, C = _C;
  }
  M = _M, N = _N, K = _K;

  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;

  // TODO: parallelize & optimize matrix multiplication on multi-node
  // You must allocate & initialize A, B, C for non-root processes

  omp_set_num_threads(num_threads);

  if (mpi_world_size == 1)
  {
    // Node #0 does everything.
    proc_start = 0;
    proc_end = M;
    num_rows = proc_end - proc_start;
    mat_mul_omp();
  }
  else
  {
    if (mpi_rank == 0)
    {
      for (int dest = 1; dest < mpi_world_size; dest++)
      {
        // Calculate the row index range of C handled by this destination.
        proc_start = M / mpi_world_size * dest + min(dest, M % mpi_world_size);
        proc_end = M / mpi_world_size * (dest + 1) + min(dest + 1, M % mpi_world_size);
        num_rows = proc_end - proc_start;
        // Send start and end index (blocking, since proc_start and proc_end
        // are reused in the next iteration).
        MPI_Send(&proc_start, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        MPI_Send(&proc_end, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        // Send the A row block and the whole B matrix; neither buffer is
        // modified afterwards, so these sends may stay in flight.
        MPI_Isend(&A[proc_start * K], num_rows * K, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
        MPI_Isend(B, K * N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
      }
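
      // Rank 0 keeps the first block of rows and computes it locally.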
      proc_start = M / mpi_world_size * mpi_rank + min(mpi_rank, M % mpi_world_size);
      proc_end = M / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, M % mpi_world_size);
      num_rows = proc_end - proc_start;
    }
    else
    {
      int source = 0;
      // Receive start and end row index.
      MPI_Recv(&proc_start, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(&proc_end, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      num_rows = proc_end - proc_start;

      if (isAlloc == 0)
      {
        // Allocate A, B, and C on the first call only.
        alloc_mat(&A, num_rows, K);
        alloc_mat(&B, K, N);
        alloc_mat(&C, num_rows, N);
        isAlloc = 1;
      }
      // Zero out the local C block on every call (mat_mul_omp accumulates into it).
      zero_mat(C, num_rows, N);

      // Receive the A row block and the whole B matrix from node #0.
      MPI_Recv(A, num_rows * K, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(B, K * N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
    }

    // Do the local matrix multiplication.
    mat_mul_omp();

    if (mpi_rank == 0)
    {
      // Receive the partial C blocks from the other nodes.
      for (int source = 1; source < mpi_world_size; source++)
      {
        proc_start = M / mpi_world_size * source + min(source, M % mpi_world_size);
        proc_end = M / mpi_world_size * (source + 1) + min(source + 1, M % mpi_world_size);
        num_rows = proc_end - proc_start;
        MPI_Recv(&C[proc_start * N], num_rows * N, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      }
    }
    else
    {
      // Send this node's partial C back to node #0 and wait for the send to
      // complete before returning, since C is reused on the next call.
      int dest = 0;
      MPI_Isend(C, num_rows * N, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
      MPI_Wait(&request, &status);
    }
  }
}