// vim:ts=2:sw=2:expandtab

#include "mat_mul.h"
#include "util.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <mpi.h>
#include <omp.h>
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
static int tslice; // rows per OpenMP thread
static int BK;     // blocking factor along K
static int BJ;     // blocking factor along N
static int BI;     // blocking factor along the row dimension
static int psize;  // number of rows of A (and C) owned by this process

inline int min(const int a, const int b){return a>b ? b : a;}
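// The kernel below uses loop blocking (tiling): each thread walks its row
// slice in BI x BJ x BK tiles so the active pieces of A, B, and C stay
// cache-resident, and the k-outside-i-outside-j loop order keeps the
// innermost loop streaming over contiguous rows of B and C. The tile sizes
// are tuning knobs, presumably sized for the target machine's caches;
// re-tune them for a different cache hierarchy.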
static void mat_mul_omp() {
  // Use num_threads threads per node.
  omp_set_num_threads(num_threads);

  tslice = psize / num_threads;
  BI = 32;
  BJ = 1024;
  BK = 1024;

  #pragma omp parallel
  {
    // Each thread owns a contiguous slice of rows; the last thread also
    // absorbs the remainder when psize is not divisible by num_threads.
    int tid = omp_get_thread_num();
    int tstart = tslice * tid;
    int tend = (tid == num_threads-1) ? psize : tstart + tslice;

    for(int ii=tstart; ii<tend; ii+=BI)
    {
      for(int jj=0; jj<N; jj+=BJ)
      {
        for(int kk=0; kk<K; kk+=BK)
        {
          for (int k=kk; k<min(kk+BK, K); ++k)
          {
            for (int i=ii; i < min(ii+BI, tend); ++i)
            {
              float Aik = A[i*K+k];
              for (int j=jj; j<min(jj+BJ,N); ++j)
              {
                C[i * N + j] += Aik * B[k * N + j];
              }
            }
          }
        }
      }
    }
  }
}
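// An equivalent way to split the rows is to let OpenMP's worksharing do the
// tstart/tend bookkeeping. A minimal sketch, not compiled in; it assumes
// psize, BI/BJ/BK, and the A/B/C globals are set up exactly as above:
#if 0
static void mat_mul_omp_for() {
  omp_set_num_threads(num_threads);
  // A static schedule over the BI-sized row blocks mirrors the manual split;
  // distinct ii blocks touch disjoint rows of C, so there are no races.
  #pragma omp parallel for schedule(static)
  for (int ii = 0; ii < psize; ii += BI)
  {
    for (int jj = 0; jj < N; jj += BJ)
      for (int kk = 0; kk < K; kk += BK)
        for (int k = kk; k < min(kk+BK, K); ++k)
          for (int i = ii; i < min(ii+BI, psize); ++i)
          {
            float Aik = A[i*K+k];
            for (int j = jj; j < min(jj+BJ, N); ++j)
              C[i * N + j] += Aik * B[k * N + j];
          }
  }
}
#endif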
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  C = _C;
  M = _M; N = _N; K = _K;
  num_threads = _num_threads;
  mpi_rank = _mpi_rank;
  mpi_world_size = _mpi_world_size;

  // Multi-node strategy: a row-wise 1-D decomposition. Rank 0 sends each
  // worker its block of rows of A, broadcasts all of B, computes its own
  // block, then gathers the workers' blocks of C. Non-root processes
  // allocate their own A, B, C buffers below.

  MPI_Request *requests; // master: one outstanding Isend of A per worker
  int pshare;            // rows per process (the last rank absorbs the remainder)
  int *pstart;           // first row owned by each rank (master only)
  int *pend;             // one-past-last row owned by each rank (master only)
  int mSizeA;            // element count of the A block being sent
  int mSizeB = K * N;    // element count of B
  int tag = 1236;

  MPI_Status status;
  pshare = M / mpi_world_size;

  // Every rank needs all of B, so a single broadcast replaces per-worker
  // point-to-point sends.
  if(mpi_rank == 0)
  {
    B = _B;
  }
  else
  {
    alloc_mat(&B, K, N);
  }
  MPI_Bcast(B, mSizeB, MPI_FLOAT, 0, MPI_COMM_WORLD);
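// Note: the point-to-point scatter/gather below could also be expressed with
// the collective MPI_Scatterv / MPI_Gatherv. A sketch under the same row
// partitioning (counts and displacements are in elements, not rows):
#if 0
{
  int *counts = (int *) malloc(mpi_world_size * sizeof(int));
  int *displs = (int *) malloc(mpi_world_size * sizeof(int));
  for (int r = 0; r < mpi_world_size; ++r)
  {
    int rs = r * pshare;
    int re = (r == mpi_world_size-1) ? M : rs + pshare;
    counts[r] = (re - rs) * K;
    displs[r] = rs * K;
  }
  // Every rank (root included) must have A allocated to counts[rank] floats.
  MPI_Scatterv(_A, counts, displs, MPI_FLOAT,
               A, counts[mpi_rank], MPI_FLOAT, 0, MPI_COMM_WORLD);
  // Gathering C works the same way, with counts/displs scaled by N instead of K.
  free(counts);
  free(displs);
}
#endif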
  if(mpi_rank == 0) // master
  {
    pstart = (int *) malloc(mpi_world_size * sizeof(int));
    pend = (int *) malloc(mpi_world_size * sizeof(int));
    requests = (MPI_Request *) malloc((mpi_world_size - 1) * sizeof(MPI_Request));

    for(int ii=1; ii<mpi_world_size; ++ii)
    {
      //========================================================================
      // size information
      //========================================================================
      pstart[ii] = ii * pshare;
      pend[ii] = (ii == mpi_world_size-1) ? M : pstart[ii] + pshare;
      psize = pend[ii] - pstart[ii];
      mSizeA = K * (pend[ii] - pstart[ii]);

      //========================================================================
      // send matrices
      //========================================================================
      MPI_Send(&psize, 1, MPI_INT, ii, tag, MPI_COMM_WORLD);
      MPI_Isend(&_A[K * pstart[ii]], mSizeA, MPI_FLOAT,
                ii, tag, MPI_COMM_WORLD, &requests[ii-1]);
    }

    //==========================================================================
    // matrix multiplication
    //==========================================================================
    A = _A;
    psize = pshare; // rank 0 keeps the first pshare rows for itself
    mat_mul_omp();

    //==========================================================================
    // merge matrix
    //==========================================================================
    for(int ii=1; ii<mpi_world_size; ++ii)
    {
      MPI_Recv(&C[pstart[ii] * N], (pend[ii] - pstart[ii]) * N,
               MPI_FLOAT, ii, tag, MPI_COMM_WORLD, &status);
    }

    // Complete the outstanding Isends of A before releasing the bookkeeping.
    MPI_Waitall(mpi_world_size - 1, requests, MPI_STATUSES_IGNORE);
    free(requests);
    free(pstart);
    free(pend);
  }
  else // worker
  {
    //==========================================================================
    // receive parameters
    //==========================================================================
    MPI_Recv(&psize, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);

    //==========================================================================
    // receive matrix A
    //==========================================================================
    alloc_mat(&A, psize, K);
    MPI_Recv(&A[0], psize * K, MPI_FLOAT, 0, tag, MPI_COMM_WORLD, &status);

    //==========================================================================
    // matrix multiplication
    //==========================================================================
    alloc_mat(&C, psize, N);
    // The kernel accumulates with +=, so C must start at zero; clear it here
    // rather than assume alloc_mat zero-fills.
    memset(C, 0, (size_t) psize * N * sizeof(float));
    mat_mul_omp();

    //==========================================================================
    // send matrix
    //==========================================================================
    // Blocking send: the transfer must complete before returning, since the
    // caller may reuse or free C afterwards.
    MPI_Send(C, psize * N, MPI_FLOAT, 0, tag, MPI_COMM_WORLD);
  }
}
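// Example driver (a sketch, not part of this translation unit): how mat_mul
// is expected to be called under an MPI launcher such as `mpirun -np 4`.
// The sizes, thread count, and fill values are hypothetical, and only rank 0
// is assumed to hold meaningful A/B/C, matching the protocol above.
#if 0
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int m = 4096, n = 4096, k = 4096, threads = 16; // illustrative
  float *a = NULL, *b = NULL, *c = NULL;
  if (rank == 0)
  {
    alloc_mat(&a, m, k);
    alloc_mat(&b, k, n);
    alloc_mat(&c, m, n);
    for (int i = 0; i < m * k; ++i) a[i] = (float)(i % 7);
    for (int i = 0; i < k * n; ++i) b[i] = (float)(i % 5);
    for (int i = 0; i < m * n; ++i) c[i] = 0.0f; // kernel accumulates into C
  }
  mat_mul(a, b, c, m, n, k, threads, rank, size);
  MPI_Finalize();
  return 0;
}
#endif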