#include "mat_mul.h"
|
|
|
|
#include <cstdlib>
|
|
#include <cstdio>
|
|
#include <omp.h>
|
|
#include <mpi.h>
|
|
|
|
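/*
 * Hybrid MPI + OpenMP matrix multiplication, C = A * B (row-major;
 * A is M x K, B is K x N, C is M x N). Rank 0 scatters row blocks of A
 * and a full copy of B to the other ranks, every rank runs the tiled
 * OpenMP kernel on its rows, and rank 0 gathers the C slices back.
 */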
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;

// Row range [low_bound, upper_bound) of C assigned to this rank.
int portion, low_bound, upper_bound;

MPI_Status status;
MPI_Request request;
static void mat_mul_omp() {
  // Tiled multiplication over this rank's rows [low_bound, upper_bound).
  int block = 80;

#pragma omp parallel num_threads(num_threads)
  {
    // Manual row partitioning: thread `num` takes rows [start, end);
    // the last thread absorbs any remainder.
    int num = omp_get_thread_num();
    int slice = (upper_bound - low_bound) / num_threads;
    int start = num * slice + low_bound;
    int end = (num == num_threads - 1) ? upper_bound
                                       : (num + 1) * slice + low_bound;

    // Loop variables live inside the parallel region so each thread gets
    // its own copy; declared at function scope they would be shared and race.
    int MB, NB, kk, k, ii;
    int iblock = 4;                   // row-tile height
    int kblock = (K / block) * block; // depth covered by full k-tiles
    int KU = kblock;                  // block is a multiple of 4, so the
                                      // 4-way unrolled loop covers all of it

    // Rows are already split across threads via [start, end), so the tile
    // loops below run sequentially within each thread; a nested
    // `omp parallel for` would either form a team of one or, with nesting
    // enabled, race on the shared rows of C.
    for (kk = 0; kk < kblock; kk += block) {
      for (ii = start; ii < end; ii += iblock) {
        for (int jj = 0; jj < N; jj += block) {
          for (k = kk; k < kk + block; k += 4) {
            MB = (ii + iblock < end) ? ii + iblock : end; // clamp row tile
            for (int i = ii; i < MB; i++) {
              // 4-way unrolling along k: hoist four elements of A's row i.
              float aik  = A[i*K + k];
              float aik1 = A[i*K + k+1];
              float aik2 = A[i*K + k+2];
              float aik3 = A[i*K + k+3];
              NB = (jj + block < N) ? jj + block : N;     // clamp column tile
              for (int j = jj; j < NB; j++) {
                C[i*N + j] += aik  * B[k*N + j]
                            + aik1 * B[(k+1)*N + j]
                            + aik2 * B[(k+2)*N + j]
                            + aik3 * B[(k+3)*N + j];
              }
            } // i
          } // k
        } // jj
      } // ii
    } // kk
    // Cleanup: depth remainder [KU, K) not covered by the unrolled tiles.
    // Each thread handles its own rows, so no further work-sharing is needed.
    for (int i = start; i < end; i++) {
      for (k = KU; k < K; k++) {
        float aik = A[i*K + k];
        for (int j = 0; j < N; j++) {
          C[i*N + j] += aik * B[k*N + j];
        }
      }
    }
  } // omp parallel
}

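/*
 * Message protocol between rank 0 (master) and the workers:
 *   tags 1/2: the worker's row range [low_bound, upper_bound)
 *   tag  3:   the worker's slice of A ((upper_bound - low_bound) x K floats)
 *   tag  4:   all of B (K x N floats)
 * and on the way back:
 *   tags 4/5: the worker echoes its row range
 *   tag  6:   the computed slice of C ((upper_bound - low_bound) x N floats)
 */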
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;

  // Cast before multiplying so large matrices do not overflow int.
  size_t bytes_mk = (size_t)M * K * sizeof(float);
  size_t bytes_kn = (size_t)K * N * sizeof(float);
  size_t bytes_mn = (size_t)M * N * sizeof(float);

  if (mpi_rank == 0) {
    // Master: split the rows evenly; the last rank absorbs the remainder.
    portion = M / mpi_world_size;
    for (int i = 1; i < mpi_world_size; i++) {
      low_bound = i * portion;
      if ((i + 1) == mpi_world_size && (M % mpi_world_size) != 0) {
        upper_bound = M;
      } else {
        upper_bound = low_bound + portion;
      }

      // The bounds are overwritten on the next iteration, so they go out
      // with blocking sends; a non-blocking send would hand MPI a buffer
      // that is modified while the transfer is in flight.
      MPI_Send(&low_bound, 1, MPI_INT, i, 1, MPI_COMM_WORLD);
      MPI_Send(&upper_bound, 1, MPI_INT, i, 2, MPI_COMM_WORLD);
      // A and B are never modified, so their sends stay non-blocking;
      // MPI_Request_free lets them complete in the background.
      MPI_Isend(&A[low_bound * K], (upper_bound - low_bound) * K, MPI_FLOAT,
                i, 3, MPI_COMM_WORLD, &request);
      MPI_Request_free(&request);
      MPI_Isend(&B[0], K * N, MPI_FLOAT, i, 4, MPI_COMM_WORLD, &request);
      MPI_Request_free(&request);
    }

    // The master computes the first chunk of rows itself. The kernel
    // accumulates into C, so C is assumed zero-initialized by the caller.
    low_bound = 0;
    upper_bound = portion;
    mat_mul_omp();

    // Gather each worker's slice of C back into place.
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Recv(&low_bound, 1, MPI_INT, i, 4, MPI_COMM_WORLD, &status);
      MPI_Recv(&upper_bound, 1, MPI_INT, i, 5, MPI_COMM_WORLD, &status);
      MPI_Recv(&C[low_bound * N], (upper_bound - low_bound) * N, MPI_FLOAT,
               i, 6, MPI_COMM_WORLD, &status);
    }
  }
  else {
    // Worker: receive the row range, then the slice of A and all of B.
    MPI_Recv(&low_bound, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&upper_bound, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &status);

    A = (float *)malloc(bytes_mk);
    B = (float *)malloc(bytes_kn);
    C = (float *)malloc(bytes_mn);
    // The kernel accumulates into C, so it must start zeroed;
    // malloc returns uninitialized memory.
    memset(C, 0, bytes_mn);

    MPI_Recv(&A[low_bound * K], (upper_bound - low_bound) * K, MPI_FLOAT,
             0, 3, MPI_COMM_WORLD, &status);
    MPI_Recv(&B[0], K * N, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &status);

    mat_mul_omp();

    // Send the result back with blocking sends: the buffers are freed
    // immediately below, so no send may still be in flight.
    MPI_Send(&low_bound, 1, MPI_INT, 0, 4, MPI_COMM_WORLD);
    MPI_Send(&upper_bound, 1, MPI_INT, 0, 5, MPI_COMM_WORLD);
    MPI_Send(&C[low_bound * N], (upper_bound - low_bound) * N, MPI_FLOAT,
             0, 6, MPI_COMM_WORLD);

    free(A);
    free(B);
    free(C);
  }
}
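/*
 * Usage sketch (hypothetical; the real driver is expected to live elsewhere,
 * e.g. the project's main file). mat_mul() is collective: every rank must
 * call it, and only rank 0 needs caller-provided, zero-initialized buffers.
 *
 *   MPI_Init(&argc, &argv);
 *   int rank, size;
 *   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *   MPI_Comm_size(MPI_COMM_WORLD, &size);
 *   // rank 0 allocates and fills A, B and zeroes C; other ranks may pass NULL
 *   mat_mul(A, B, C, M, N, K, num_threads, rank, size);
 *   MPI_Finalize();
 */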