// chundoong-lab-ta/SamsungDS22/submissions/HW4/hkyoo.kim/mat_mul.cpp
//
// 156 lines
// 5.1 KiB
// C++

#include "mat_mul.h"
#include <cstdlib>
#include <cstdio>
#include <omp.h>
#include <mpi.h>
// Operand matrices (A, B) and the result matrix (C) used by mat_mul_omp().
// They are indexed as flat arrays: A[i*K + k], B[k*N + j], C[i*N + j].
static float *A;
static float *B;
static float *C;

// Problem dimensions as passed to mat_mul().
static int M;
static int N;
static int K;

// Number of OpenMP threads requested for the compute kernel.
static int num_threads;

// This process's MPI rank and the total number of MPI processes.
static int mpi_rank;
static int mpi_world_size;

// Row partitioning state: `portion` is M / mpi_world_size, and
// [low_bound, upper_bound) is the row range mat_mul_omp() works on.
int portion;
int low_bound;
int upper_bound;

// Shared MPI status / request objects used by the communication calls.
MPI_Status status;
MPI_Request request;
static void mat_mul_omp() {
// TODO: parallelize & optimize matrix multiplication
int block = 80;
int KB,MB,NB,kk,k,ii;
#pragma omp parallel num_threads(num_threads)
{
int num = omp_get_thread_num();
int slice = (upper_bound-low_bound) / num_threads;
int start = num * slice + low_bound;
int end = (num == num_threads -1)? upper_bound : ((num+1)*slice+low_bound);
int iblock = 4;
int mblock = ((end-start)/iblock) * iblock + start;
int nblock = (N/block) * block;
int kblock = (K/block) * block;
int KU = (kblock/4) * 4;
#pragma omp parallel for schedule(static) private(KB,MB,NB,kk,k,ii)
for (kk = 0; kk < kblock; kk+=block) {
for (ii = start; ii < mblock; ii+=iblock) {
for (int jj = 0; jj < nblock; jj+=block) {
if(kk+block < kblock) KB = kk + block;
else KB = KU;
for (k = kk; k < KB; k+=4) {
if(ii+iblock < mblock) MB = ii + iblock;
else MB = end;
for (int i = ii; i < MB; i++) {
float aik = A[i*K + k];
float aik1 = A[i*K + k+1];
float aik2 = A[i*K + k+2];
float aik3 = A[i*K + k+3];
// float aik4 = A[i*K + k+4];
// float aik5 = A[i*K + k+5];
// float aik6 = A[i*K + k+6];
// float aik7 = A[i*K + k+7];
if(jj+block < nblock) NB = jj + block;
else NB = N;
for (int j = jj; j < NB; j++) {
C[i * N + j] = C[i*N + j] + aik * B[k * N + j]
+ aik1 * B[(k+1) * N + j]
+ aik2 * B[(k+2) * N + j]
+ aik3 * B[(k+3) * N + j];
// + aik4 * B[(k+4) * N + j]
// + aik5 * B[(k+5) * N + j]
// + aik6 * B[(k+6) * N + j]
// + aik7 * B[(k+7) * N + j];
}
} // i
} //k
} // ii
} // jj
} // kk
#pragma omp parallel for schedule(static) private(k)
for (int i = start; i < end; i++) {
for(k=KU;k<K;k++){
float aik = A[i*K + k];
for(int j=0;j<N;j++){
C[i*N+j] += aik * B[k*N +j];
}
}
}
} // omp
}
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads, int _mpi_rank, int _mpi_world_size) {
A = _A, B = _B, C = _C;
M = _M, N = _N, K = _K;
num_threads = _num_threads, mpi_rank = _mpi_rank,
mpi_world_size = _mpi_world_size;
size_t bytes_mk = M*K*sizeof(float);
size_t bytes_kn = K*N*sizeof(float);
size_t bytes_mn = M*N*sizeof(float);
if(mpi_rank == 0){
portion = (M / (mpi_world_size) );
for(int i =1; i < mpi_world_size; i++){
low_bound = i * portion;
if (((i + 1) == mpi_world_size) && ((M % (mpi_world_size)) != 0)) {
upper_bound = M;
} else {
upper_bound = low_bound + portion;
}
MPI_Isend(&low_bound, 1, MPI_INT, i, 1, MPI_COMM_WORLD, &request);
//next send the upper bound without blocking, to the intended slave
MPI_Isend(&upper_bound, 1, MPI_INT, i, 2, MPI_COMM_WORLD, &request);
//finally send the allocated row portion of [A] without blocking, to the intended slave
MPI_Isend(&A[low_bound*K], (upper_bound - low_bound) * K, MPI_FLOAT, i, 3, MPI_COMM_WORLD, &request);
MPI_Isend(&B[0], N * K, MPI_FLOAT, i, 4, MPI_COMM_WORLD, &request);
}
low_bound=0;
upper_bound=portion;
mat_mul_omp();
for(int i = 1; i < mpi_world_size; i++){
MPI_Recv(&low_bound, 1, MPI_INT, i, 4, MPI_COMM_WORLD, &status);
MPI_Recv(&upper_bound, 1, MPI_INT, i, 5, MPI_COMM_WORLD, &status);
MPI_Recv(&C[low_bound*N], (upper_bound - low_bound) * N, MPI_FLOAT, i, 6, MPI_COMM_WORLD, &status);
}
}
else{
MPI_Recv(&low_bound, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
//next receive upper bound from the master
MPI_Recv(&upper_bound, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &status);
//finally receive row portion of [A] to be processed from the master
A = (float*)malloc(bytes_mk);
B = (float*)malloc(bytes_kn);
C = (float*)malloc(bytes_mn);
MPI_Recv(&A[low_bound*K], (upper_bound - low_bound) * K, MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &status);
MPI_Recv(&B[0], N * K, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &status);
mat_mul_omp();
MPI_Isend(&low_bound, 1, MPI_INT, 0, 4, MPI_COMM_WORLD,&request);
MPI_Isend(&upper_bound, 1, MPI_INT, 0, 5, MPI_COMM_WORLD,&request);
MPI_Isend(&C[low_bound*N], (upper_bound - low_bound) * N, MPI_FLOAT, 0, 6, MPI_COMM_WORLD, &request);
free(A);
free(B);
free(C);
}
}