chundoong-lab-ta/SamsungDS22/submissions/HW4/bumhee86.lee/mat_mul.cpp

976 lines
51 KiB
C++

#include "mat_mul.h"
#define ENABLE_TIME_MEASURE (0)
#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <immintrin.h>
#include <omp.h>
#if (ENABLE_TIME_MEASURE)
#include "util.h"
#endif
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
static int mpi_rank, mpi_world_size;
#define OPTIMAL_MATRIX_SIZE (8192)
#define ITILESIZE (50)
#define JTILESIZE (1024)
#define KTILESIZE (1024)
#define OPTIMAL_MPI_SIZE (4)
#define VALIDATE_THREAD_CNT (0)
#define ENABLE_PREFETCH (1)
#define OPTIMAL_MPI_SIZE_BUG (3)
#define MASTER_PANALTY_ROW (256)
#define MASTER_PANALTY_SIZE (MASTER_PANALTY_ROW * OPTIMAL_MATRIX_SIZE)
#define FAIR_ROW_CNT (OPTIMAL_MATRIX_SIZE / OPTIMAL_MPI_SIZE)
#define P0_END_I_ROW (FAIR_ROW_CNT + (MASTER_PANALTY_ROW * (OPTIMAL_MPI_SIZE - 1)))
#define P1_START_I_ROW (P0_END_I_ROW)
#define P1_END_I_ROW (P1_START_I_ROW + FAIR_ROW_CNT - MASTER_PANALTY_ROW)
#define P2_START_I_ROW (P1_END_I_ROW)
#define P2_END_I_ROW (P2_START_I_ROW + FAIR_ROW_CNT - MASTER_PANALTY_ROW)
#define P3_START_I_ROW (P2_END_I_ROW)
#define MASTER_PROCESS_ROW_SIZE (P0_END_I_ROW)
#define SLAVE_PROCESS_ROW_SIZE (FAIR_ROW_CNT - MASTER_PANALTY_ROW)
#define MASTER_PROCESS_TRANS_TOTAL_SIZE (P0_END_I_ROW * OPTIMAL_MATRIX_SIZE)
#define SLAVE_PROCESS_TRANS_TOTAL_SIZE (SLAVE_PROCESS_ROW_SIZE * OPTIMAL_MATRIX_SIZE)
#define MIN(__A,__B) ((__A) < (__B) ? (__A) : (__B)) // NOTE: evaluates each argument twice; could std::min be used here instead?
#if (ENABLE_PREFETCH)
#define MM_PREFETCH(__A, __B) _mm_prefetch(__A, __B)
#else
#define MM_PREFETCH(__A, __B)
#endif
#define _3_SLICE_FIRST_ROW (1024*3)
#define _3_SLICE_OTHER_ROW_SIZE ((OPTIMAL_MATRIX_SIZE - _3_SLICE_FIRST_ROW) / 2)
#define _3_SLICE_SECOND_ROW (_3_SLICE_FIRST_ROW + _3_SLICE_OTHER_ROW_SIZE)
#define _3_SLICE_THIRD_ROW (_3_SLICE_SECOND_ROW + _3_SLICE_OTHER_ROW_SIZE)
#define _3_SLICE_TRANSFER_ROW (_3_SLICE_OTHER_ROW_SIZE / 2)
#define _3_SLICE_TRANSFER_SIZE (_3_SLICE_TRANSFER_ROW * OPTIMAL_MATRIX_SIZE)
static void _alloc_mat(float **m, int R, int C) {
// Allocates an R x C float matrix into *m and aborts the process on failure.
// The SIMD kernels in this file load from these buffers with _mm512_load_ps,
// which requires 64-byte alignment, so allocate with 64-byte alignment (the
// original 32 bytes is not enough for aligned 512-bit loads). C11/C++17
// aligned_alloc also requires the size to be a multiple of the alignment,
// so round the byte count up.
size_t bytes = sizeof(float) * (size_t)R * (size_t)C;
bytes = (bytes + 63) & ~(size_t)63;
*m = (float *)aligned_alloc(64, bytes);
if (*m == NULL) {
// Report on stderr and exit with a failure status; the original wrote to
// stdout and called exit(0), which signals success to the shell.
fprintf(stderr, "Failed to allocate memory for matrix.\n");
exit(EXIT_FAILURE);
}
}
static void mat_mul_optimal_omp_00() {
// Rank-0 kernel of the 3-process split: computes C += A * B for rows
// [0, _3_SLICE_FIRST_ROW) of an OPTIMAL_MATRIX_SIZE-square matrix.
// Tile-blocked ITILESIZE x JTILESIZE x KTILESIZE, with an 8-way unroll
// over k (broadcast A scalars) and 32 floats of C updated per jj step
// (two 16-wide AVX-512 FMA chains).
// NOTE(review): _mm512_load_ps requires 64-byte-aligned pointers, but
// _alloc_mat only guarantees 32 bytes — verify allocation alignment.
// NOTE(review): firstprivate(nPrintf) below references a variable that is
// not declared anywhere visible; enabling VALIDATE_THREAD_CNT would not
// compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = 0; i < _3_SLICE_FIRST_ROW; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, _3_SLICE_FIRST_ROW); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C is indexed with N here but with OPTIMAL_MATRIX_SIZE
// everywhere else in this kernel — equivalent only because the caller
// guarantees N == OPTIMAL_MATRIX_SIZE on this path.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp_01() {
// Rank-1 kernel of the 3-process split: computes C += A * B for rows
// [_3_SLICE_FIRST_ROW, _3_SLICE_SECOND_ROW). Identical structure to
// mat_mul_optimal_omp_00 apart from the row band: ITILESIZE x JTILESIZE x
// KTILESIZE blocking, 8-way k-unroll, 32 floats of C per jj step via
// AVX-512 FMAs.
// NOTE(review): firstprivate(nPrintf) references an undeclared variable;
// enabling VALIDATE_THREAD_CNT would not compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = _3_SLICE_FIRST_ROW; i < _3_SLICE_SECOND_ROW; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, _3_SLICE_SECOND_ROW); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C indexed with N here but OPTIMAL_MATRIX_SIZE elsewhere;
// equivalent only because the caller guarantees N == OPTIMAL_MATRIX_SIZE.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp_02() {
// Rank-2 kernel of the 3-process split: computes C += A * B for rows
// [_3_SLICE_SECOND_ROW, OPTIMAL_MATRIX_SIZE). Same blocked AVX-512 FMA
// structure as mat_mul_optimal_omp_00/_01, differing only in the row band.
// NOTE(review): firstprivate(nPrintf) references an undeclared variable;
// enabling VALIDATE_THREAD_CNT would not compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = _3_SLICE_SECOND_ROW; i < OPTIMAL_MATRIX_SIZE; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C indexed with N here but OPTIMAL_MATRIX_SIZE elsewhere;
// equivalent only because the caller guarantees N == OPTIMAL_MATRIX_SIZE.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp_0() {
// Rank-0 kernel of the 4-process split: computes C += A * B for the first
// row quarter, [0, OPTIMAL_MATRIX_SIZE / 4). Blocked ITILESIZE x JTILESIZE
// x KTILESIZE with AVX-512 FMAs, 8-way k-unroll, 32 floats of C per jj step.
// NOTE(review): firstprivate(nPrintf) references an undeclared variable;
// enabling VALIDATE_THREAD_CNT would not compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = 0; i < OPTIMAL_MATRIX_SIZE / 4; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE / 4); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C indexed with N here but OPTIMAL_MATRIX_SIZE elsewhere;
// equivalent only because the caller guarantees N == OPTIMAL_MATRIX_SIZE.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp_1() {
// Rank-1 kernel of the 4-process split: computes C += A * B for the second
// row quarter, [OPTIMAL_MATRIX_SIZE / 4, OPTIMAL_MATRIX_SIZE / 4 * 2).
// Same blocked AVX-512 FMA structure as mat_mul_optimal_omp_0.
// NOTE(review): firstprivate(nPrintf) references an undeclared variable;
// enabling VALIDATE_THREAD_CNT would not compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = OPTIMAL_MATRIX_SIZE / 4; i < OPTIMAL_MATRIX_SIZE / 4 * 2; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE / 4 * 2); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C indexed with N here but OPTIMAL_MATRIX_SIZE elsewhere;
// equivalent only because the caller guarantees N == OPTIMAL_MATRIX_SIZE.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp_2() {
// Rank-2 kernel of the 4-process split: computes C += A * B for the third
// row quarter, [OPTIMAL_MATRIX_SIZE / 4 * 2, OPTIMAL_MATRIX_SIZE / 4 * 3).
// Same blocked AVX-512 FMA structure as mat_mul_optimal_omp_0.
// NOTE(review): firstprivate(nPrintf) references an undeclared variable;
// enabling VALIDATE_THREAD_CNT would not compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = OPTIMAL_MATRIX_SIZE / 4 * 2; i < OPTIMAL_MATRIX_SIZE / 4 * 3; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE / 4 * 3); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C indexed with N here but OPTIMAL_MATRIX_SIZE elsewhere;
// equivalent only because the caller guarantees N == OPTIMAL_MATRIX_SIZE.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp_3() {
// Rank-3 kernel of the 4-process split: computes C += A * B for the last
// row quarter, [OPTIMAL_MATRIX_SIZE / 4 * 3, OPTIMAL_MATRIX_SIZE).
// Same blocked AVX-512 FMA structure as mat_mul_optimal_omp_0.
// NOTE(review): firstprivate(nPrintf) references an undeclared variable;
// enabling VALIDATE_THREAD_CNT would not compile as-is.
#if (!VALIDATE_THREAD_CNT)
#pragma omp parallel for collapse(3) schedule(static)
#else
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
#endif
for (int i = OPTIMAL_MATRIX_SIZE / 4 * 3; i < OPTIMAL_MATRIX_SIZE; i += ITILESIZE) {
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
#if (VALIDATE_THREAD_CNT)
if (nPrintf == 0)
{
//#pragma omp critical
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
nPrintf = 1;
}
#endif // VALIDATE_THREAD_CNT
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
// Process 8 consecutive k values per iteration (8-way FMA unroll).
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE); ++ii) {
// Broadcast 8 consecutive elements of A's row ii across vector lanes.
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
// NOTE(review): C indexed with N here but OPTIMAL_MATRIX_SIZE elsewhere;
// equivalent only because the caller guarantees N == OPTIMAL_MATRIX_SIZE.
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
c0 = _mm512_fmadd_ps(a0, b0, c0);
c0 = _mm512_fmadd_ps(a1, b1, c0);
c0 = _mm512_fmadd_ps(a2, b2, c0);
c0 = _mm512_fmadd_ps(a3, b3, c0);
c0 = _mm512_fmadd_ps(a4, b4, c0);
c0 = _mm512_fmadd_ps(a5, b5, c0);
c0 = _mm512_fmadd_ps(a6, b6, c0);
c0 = _mm512_fmadd_ps(a7, b7, c0);
// Second 16-float column of the 32-wide strip.
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
d0 = _mm512_fmadd_ps(a0, e0, d0);
d0 = _mm512_fmadd_ps(a1, e1, d0);
d0 = _mm512_fmadd_ps(a2, e2, d0);
d0 = _mm512_fmadd_ps(a3, e3, d0);
d0 = _mm512_fmadd_ps(a4, e4, d0);
d0 = _mm512_fmadd_ps(a5, e5, d0);
d0 = _mm512_fmadd_ps(a6, e6, d0);
d0 = _mm512_fmadd_ps(a7, e7, d0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
}
}
}
}
}
}
}
static void mat_mul_optimal_omp() {
// Runs the row-quarter kernel assigned to this process in the 4-node split.
// Ranks outside [0, 3] do nothing, matching the original switch's default
// (assert) case.
typedef void (*band_kernel_fn)();
static const band_kernel_fn band_kernels[4] = {
mat_mul_optimal_omp_0,
mat_mul_optimal_omp_1,
mat_mul_optimal_omp_2,
mat_mul_optimal_omp_3,
};
if (0 <= mpi_rank && mpi_rank < 4) {
band_kernels[mpi_rank]();
}
}
static void mat_mul_optimal_omp_3node() {
// Runs the row-band kernel assigned to this process in the 3-node split.
// Ranks outside [0, 2] do nothing, matching the original switch's default
// (assert) case.
typedef void (*band_kernel_fn)();
static const band_kernel_fn band_kernels[3] = {
mat_mul_optimal_omp_00,
mat_mul_optimal_omp_01,
mat_mul_optimal_omp_02,
};
if (0 <= mpi_rank && mpi_rank < 3) {
band_kernels[mpi_rank]();
}
}
static void mat_mul_omp() {
// Generic fallback: C += A * B (M x K by K x N) with rows of C divided into
// contiguous bands, one band per OpenMP thread, so no two threads ever write
// the same row of C.
int bs = K; // k-blocking currently disabled: one block spans all of K
// Guard against num_threads == 0 to avoid a division by zero; +1 keeps the
// original band sizing (ceil-ish split) for the normal case.
int OptimalSliceCount = (num_threads > 0 ? M / num_threads : M) + 1;
#pragma omp parallel for schedule(static)
for (int ii = 0; ii < M; ii += OptimalSliceCount)
{
for (int kk = 0; kk < K; kk += bs)
{
for (int i = ii; i < MIN(ii + OptimalSliceCount, M); ++i)
{
for (int k = kk; k < MIN(kk + bs, K); ++k)
{
// BUG FIX: Aik used to be declared at function scope, which made it
// implicitly shared across all OpenMP threads — a data race that could
// corrupt results. Declaring it inside the parallel region makes it
// private to each thread.
float Aik = A[i * K + k];
for (int j = 0; j < N; ++j)
{
C[i * N + j] += Aik * B[k * N + j];
}
}
}
}
}
}
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
int _num_threads, int _mpi_rank, int _mpi_world_size) {
A = _A, B = _B, C = _C;
M = _M, N = _N, K = _K;
num_threads = _num_threads, mpi_rank = _mpi_rank,
mpi_world_size = _mpi_world_size;
MPI_Status stMpiStatus;
MPI_Request stMpiRequest[30];
#if (ENABLE_TIME_MEASURE)
double elapsed_time;
timer_start(1);
#endif
// TODO: parallelize & optimize matrix multiplication on multi-node
// You must allocate & initialize A, B, C for non-root processes
omp_set_num_threads(_num_threads);
// FIXME: for now, only root process runs the matrix multiplication.
if (_M == OPTIMAL_MATRIX_SIZE
&& _N == OPTIMAL_MATRIX_SIZE
&& _K == OPTIMAL_MATRIX_SIZE
&& _mpi_world_size == OPTIMAL_MPI_SIZE_BUG)
{
if (_mpi_rank == 0)
{
MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 9, MPI_COMM_WORLD, &stMpiRequest[7]);
MPI_Isend(A + _3_SLICE_FIRST_ROW * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
MPI_Isend(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[9]);
MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 2, MPI_COMM_WORLD, &stMpiRequest[10]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 3, MPI_COMM_WORLD, &stMpiRequest[11]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 4, MPI_COMM_WORLD, &stMpiRequest[12]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 5, MPI_COMM_WORLD, &stMpiRequest[13]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 6, MPI_COMM_WORLD, &stMpiRequest[14]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 7, MPI_COMM_WORLD, &stMpiRequest[15]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 8, MPI_COMM_WORLD, &stMpiRequest[16]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 9, MPI_COMM_WORLD, &stMpiRequest[17]);
MPI_Isend(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 2) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[18]);
MPI_Isend(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 3) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[19]);
MPI_Irecv(C + _3_SLICE_FIRST_ROW * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[20]);
MPI_Irecv(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[21]);
// ---- 3-node special path, rank 0: post receives for rank 2's C slices ----
// Non-blocking receives for the 3rd and 4th transfer slices of C, produced by
// rank 2 (tags 0 and 1).  They are waited on in the rank-0 loop below (indices
// 22/23 of stMpiRequest); requests 0..21 are presumably the A/B sends and the
// rank-1 C receives posted before this chunk — TODO confirm against the part
// of the function above L711.
MPI_Irecv(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 2) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[22]);
MPI_Irecv(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 3) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[23]);
}
else
{
// ---- 3-node special path, worker ranks: receive inputs from rank 0 ----
// The full B matrix arrives in 8 equal horizontal chunks (tags 2..9) so the
// transfers can progress in parallel with each other.
_alloc_mat(&B, K, N);
MPI_Irecv(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 9, MPI_COMM_WORLD, &stMpiRequest[7]);
// This rank's two A row-slices (tags 0 and 1).  The destination offset is
// keyed off (_mpi_rank - 1), i.e. slices 0-1 go to rank 1 and slices 2-3 to
// rank 2, matching the C receive offsets posted by rank 0 above.
_alloc_mat(&A, M, K);
MPI_Irecv(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
MPI_Irecv(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2 + 1)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[9]);
_alloc_mat(&C, M, N);
// Block until all 10 receives (8x B + 2x A) have landed before computing.
#pragma GCC unroll 10
for (int i = 0; i < 10; ++i)
{
MPI_Wait(&stMpiRequest[i], &stMpiStatus);
}
}
#if (ENABLE_TIME_MEASURE)
elapsed_time = timer_stop(1);
printf("%f sec, Sending complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
// Local tiled/OMP multiply for this rank's share of the 3-node decomposition.
mat_mul_optimal_omp_3node();
#if (ENABLE_TIME_MEASURE)
elapsed_time = timer_stop(1);
printf("%f sec, Calculation complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
if (_mpi_rank == 0)
{
// Rank 0: drain all 24 outstanding requests — the distribution sends plus
// the C-slice receives (indices 22/23 posted at the top of this chunk; the
// remaining indices are assumed to have been posted before L711 — verify).
#pragma GCC unroll 24
for (int i = 0; i < 24; ++i)
{
MPI_Wait(&stMpiRequest[i], &stMpiStatus);
}
}
else
{
// Workers: ship the two computed C slices back to rank 0 (tags 0 and 1).
MPI_Isend(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Isend(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2 + 1)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
// A and B are no longer referenced (the in-flight sends read only C), so
// they can be released while the transfers progress; C must outlive the
// sends, hence the waits before freeing it.
free(A);
free(B);
MPI_Wait(&stMpiRequest[0], &stMpiStatus);
MPI_Wait(&stMpiRequest[1], &stMpiStatus);
free(C);
}
#if (ENABLE_TIME_MEASURE)
elapsed_time = timer_stop(1);
printf("%f sec, Collect Complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
}
// ---- 4-node special path: 8192x8192 with exactly OPTIMAL_MPI_SIZE ranks ----
else if (_M == OPTIMAL_MATRIX_SIZE
&& _N == OPTIMAL_MATRIX_SIZE
&& _K == OPTIMAL_MATRIX_SIZE
&& _mpi_world_size == OPTIMAL_MPI_SIZE)
{
if (_mpi_rank == 0)
{
// Rank 0 keeps A row-chunks 0-1 for itself and distributes, per worker
// rank r in {1,2,3}: the whole B in eighths (tags 2..9, request indices
// 10*(r-1)..10*(r-1)+7) and A row-chunks 2r and 2r+1 (tags 0/1, request
// indices 10*(r-1)+8/+9) — 30 non-blocking sends in total.
MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 9, MPI_COMM_WORLD, &stMpiRequest[7]);
MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[9]);
MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 2, MPI_COMM_WORLD, &stMpiRequest[10]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 3, MPI_COMM_WORLD, &stMpiRequest[11]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 4, MPI_COMM_WORLD, &stMpiRequest[12]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 5, MPI_COMM_WORLD, &stMpiRequest[13]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 6, MPI_COMM_WORLD, &stMpiRequest[14]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 7, MPI_COMM_WORLD, &stMpiRequest[15]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 8, MPI_COMM_WORLD, &stMpiRequest[16]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 9, MPI_COMM_WORLD, &stMpiRequest[17]);
MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[18]);
MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[19]);
MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 2, MPI_COMM_WORLD, &stMpiRequest[20]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 3, MPI_COMM_WORLD, &stMpiRequest[21]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 4, MPI_COMM_WORLD, &stMpiRequest[22]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 5, MPI_COMM_WORLD, &stMpiRequest[23]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 6, MPI_COMM_WORLD, &stMpiRequest[24]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 7, MPI_COMM_WORLD, &stMpiRequest[25]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 8, MPI_COMM_WORLD, &stMpiRequest[26]);
MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 9, MPI_COMM_WORLD, &stMpiRequest[27]);
MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 0, MPI_COMM_WORLD, &stMpiRequest[28]);
MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 1, MPI_COMM_WORLD, &stMpiRequest[29]);
}
else
{
// Worker ranks 1..3: mirror of the sends above — B in eighths (tags 2..9)
// and this rank's two A row-chunks (2*rank and 2*rank+1, tags 0/1).
_alloc_mat(&B, K, N);
MPI_Irecv(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 9, MPI_COMM_WORLD, &stMpiRequest[7]);
_alloc_mat(&A, M, K);
MPI_Irecv(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
MPI_Irecv(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2 + 1)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[9]);
_alloc_mat(&C, M, N);
// All 10 receives must complete before the local multiply starts.
#pragma GCC unroll 10
for (int i = 0; i < 10; ++i)
{
MPI_Wait(&stMpiRequest[i], &stMpiStatus);
}
}
#if (ENABLE_TIME_MEASURE)
elapsed_time = timer_stop(1);
printf("%f sec, Sending complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
// Local tiled/OMP multiply for this rank's quarter of the rows.
mat_mul_optimal_omp();
#if (ENABLE_TIME_MEASURE)
elapsed_time = timer_stop(1);
printf("%f sec, Calculation complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
if (_mpi_rank == 0)
{
// Drain all 30 distribution sends before reusing stMpiRequest slots for
// the C collection below.  NOTE(review): unlike the sibling wait loops,
// this one has no "#pragma GCC unroll" — cosmetic inconsistency only.
for (int i = 0; i < 30; ++i)
{
MPI_Wait(&stMpiRequest[i], &stMpiStatus);
}
// Collect C row-chunks 2..7: two chunks from each worker rank (tags 0/1),
// written directly into their final positions in C.
MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[2]);
MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[3]);
MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 0, MPI_COMM_WORLD, &stMpiRequest[4]);
MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 1, MPI_COMM_WORLD, &stMpiRequest[5]);
#pragma GCC unroll 6
for (int i = 0; i < 6; ++i)
{
MPI_Wait(&stMpiRequest[i], &stMpiStatus);
}
}
else
{
// Workers: return the two computed C row-chunks (tags 0/1).  A/B can be
// freed immediately since the in-flight sends only read C; C is freed
// only after both sends complete.
MPI_Isend(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
MPI_Isend(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2 + 1)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
free(A);
free(B);
MPI_Wait(&stMpiRequest[0], &stMpiStatus);
MPI_Wait(&stMpiRequest[1], &stMpiStatus);
free(C);
}
#if (ENABLE_TIME_MEASURE)
elapsed_time = timer_stop(1);
printf("%f sec, Collect Complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
}
else
{
// Fallback for non-optimal sizes / world sizes: rank 0 computes the whole
// product alone with the OMP kernel; other ranks do no work here.
// NOTE(review): this is only correct if callers never read C on non-zero
// ranks in this path — confirm against the caller/validation code.
if (_mpi_rank == 0)
mat_mul_omp();
}
}