/*
 * mat_mul — multi-node matrix multiplication (MPI + OpenMP + AVX-512).
 * C += A * B, specialized for 8192x8192 matrices split across 3 or 4 ranks,
 * with a generic single-node OpenMP fallback.
 */
#include "mat_mul.h"
|
|
|
|
|
|
#define ENABLE_TIME_MEASURE (0)
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <mpi.h>
|
|
#include <immintrin.h>
|
|
#include <omp.h>
|
|
#if (ENABLE_TIME_MEASURE)
|
|
#include "util.h"
|
|
#endif
|
|
|
|
static float *A, *B, *C;
|
|
static int M, N, K;
|
|
static int num_threads;
|
|
static int mpi_rank, mpi_world_size;
|
|
|
|
#define OPTIMAL_MATRIX_SIZE (8192)
|
|
#define ITILESIZE (50)
|
|
#define JTILESIZE (1024)
|
|
#define KTILESIZE (1024)
|
|
#define OPTIMAL_MPI_SIZE (4)
|
|
#define VALIDATE_THREAD_CNT (0)
|
|
#define ENABLE_PREFETCH (1)
|
|
#define OPTIMAL_MPI_SIZE_BUG (3)
|
|
|
|
#define MASTER_PANALTY_ROW (256)
|
|
#define MASTER_PANALTY_SIZE (MASTER_PANALTY_ROW * OPTIMAL_MATRIX_SIZE)
|
|
|
|
#define FAIR_ROW_CNT (OPTIMAL_MATRIX_SIZE / OPTIMAL_MPI_SIZE)
|
|
#define P0_END_I_ROW (FAIR_ROW_CNT + (MASTER_PANALTY_ROW * (OPTIMAL_MPI_SIZE - 1)))
|
|
#define P1_START_I_ROW (P0_END_I_ROW)
|
|
#define P1_END_I_ROW (P1_START_I_ROW + FAIR_ROW_CNT - MASTER_PANALTY_ROW)
|
|
#define P2_START_I_ROW (P1_END_I_ROW)
|
|
#define P2_END_I_ROW (P2_START_I_ROW + FAIR_ROW_CNT - MASTER_PANALTY_ROW)
|
|
#define P3_START_I_ROW (P2_END_I_ROW)
|
|
|
|
#define MASTER_PROCESS_ROW_SIZE (P0_END_I_ROW)
|
|
#define SLAVE_PROCESS_ROW_SIZE (FAIR_ROW_CNT - MASTER_PANALTY_ROW)
|
|
#define MASTER_PROCESS_TRANS_TOTAL_SIZE (P0_END_I_ROW * OPTIMAL_MATRIX_SIZE)
|
|
#define SLAVE_PROCESS_TRANS_TOTAL_SIZE (SLAVE_PROCESS_ROW_SIZE * OPTIMAL_MATRIX_SIZE)
|
|
|
|
#define MIN(__A,__B) ((__A) < (__B) ? (__A) : (__B)) // Can std::min goes to inline function??
|
|
#if (ENABLE_PREFETCH)
|
|
#define MM_PREFETCH(__A, __B) _mm_prefetch(__A, __B)
|
|
#else
|
|
#define MM_PREFETCH(__A, __B)
|
|
#endif
|
|
|
|
#define _3_SLICE_FIRST_ROW (1024*3)
|
|
#define _3_SLICE_OTHER_ROW_SIZE ((OPTIMAL_MATRIX_SIZE - _3_SLICE_FIRST_ROW) / 2)
|
|
#define _3_SLICE_SECOND_ROW (_3_SLICE_FIRST_ROW + _3_SLICE_OTHER_ROW_SIZE)
|
|
#define _3_SLICE_THIRD_ROW (_3_SLICE_SECOND_ROW + _3_SLICE_OTHER_ROW_SIZE)
|
|
#define _3_SLICE_TRANSFER_ROW (_3_SLICE_OTHER_ROW_SIZE / 2)
|
|
#define _3_SLICE_TRANSFER_SIZE (_3_SLICE_TRANSFER_ROW * OPTIMAL_MATRIX_SIZE)
|
|
|
|
/*
 * Allocate a 32-byte-aligned R x C matrix of floats into *m.
 * Terminates the process on allocation failure (allocation policy: abort).
 *
 * Fixes vs. previous revision:
 *  - exit(0) reported SUCCESS to the shell on failure; now EXIT_FAILURE.
 *  - the diagnostic goes to stderr, not stdout.
 *  - the byte count is computed in size_t (the old int product could
 *    overflow for large R*C) and rounded up to a multiple of 32, as C11
 *    requires the size passed to aligned_alloc to be a multiple of the
 *    alignment.
 * NOTE(review): the leading-underscore file-scope name is reserved by the
 * implementation; kept for interface compatibility with existing callers.
 */
static void _alloc_mat(float **m, int R, int C) {
  size_t bytes = sizeof(float) * (size_t)R * (size_t)C;
  bytes = (bytes + 31u) & ~(size_t)31u; /* round up to alignment multiple */

  *m = (float *)aligned_alloc(32, bytes);
  if (*m == NULL) {
    fprintf(stderr, "Failed to allocate memory for matrix.\n");
    exit(EXIT_FAILURE);
  }
}
|
|
|
|
|
|
static void mat_mul_optimal_omp_00() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = 0; i < _3_SLICE_FIRST_ROW; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, _3_SLICE_FIRST_ROW); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
static void mat_mul_optimal_omp_01() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = _3_SLICE_FIRST_ROW; i < _3_SLICE_SECOND_ROW; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, _3_SLICE_SECOND_ROW); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp_02() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = _3_SLICE_SECOND_ROW; i < OPTIMAL_MATRIX_SIZE; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp_0() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = 0; i < OPTIMAL_MATRIX_SIZE / 4; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE / 4); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp_1() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = OPTIMAL_MATRIX_SIZE / 4; i < OPTIMAL_MATRIX_SIZE / 4 * 2; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE / 4 * 2); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp_2() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = OPTIMAL_MATRIX_SIZE / 4 * 2; i < OPTIMAL_MATRIX_SIZE / 4 * 3; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE / 4 * 3); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp_3() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
#if (!VALIDATE_THREAD_CNT)
|
|
#pragma omp parallel for collapse(3) schedule(static)
|
|
#else
|
|
#pragma omp parallel for collapse(3) schedule(static) firstprivate(nPrintf)
|
|
#endif
|
|
for (int i = OPTIMAL_MATRIX_SIZE / 4 * 3; i < OPTIMAL_MATRIX_SIZE; i += ITILESIZE) {
|
|
for (int j = 0; j < OPTIMAL_MATRIX_SIZE; j += JTILESIZE) {
|
|
for (int k = 0; k < OPTIMAL_MATRIX_SIZE; k += KTILESIZE) {
|
|
#if (VALIDATE_THREAD_CNT)
|
|
if (nPrintf == 0)
|
|
{
|
|
//#pragma omp critical
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
MM_PREFETCH((const char*)&A[(i+0)*OPTIMAL_MATRIX_SIZE+(k+0)], _MM_HINT_T0);
|
|
for (int kk = k; kk < k + KTILESIZE; kk += 8) {
|
|
MM_PREFETCH((const char*)&C[(i+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
MM_PREFETCH((const char*)&B[(k+0) * OPTIMAL_MATRIX_SIZE + j], _MM_HINT_T0);
|
|
for (int ii = i; ii < MIN(i + ITILESIZE, OPTIMAL_MATRIX_SIZE); ++ii) {
|
|
__m512 a0 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+0)]);
|
|
__m512 a1 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+1)]);
|
|
__m512 a2 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+2)]);
|
|
__m512 a3 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+3)]);
|
|
__m512 a4 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+4)]);
|
|
__m512 a5 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+5)]);
|
|
__m512 a6 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+6)]);
|
|
__m512 a7 = _mm512_set1_ps(A[(ii+0)*OPTIMAL_MATRIX_SIZE+(kk+7)]);
|
|
|
|
for (int jj = j; jj < j + JTILESIZE; jj += 32) {
|
|
__m512 c0 = _mm512_load_ps(&C[(ii+0) * N + jj]);
|
|
|
|
|
|
__m512 b0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
__m512 b7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj]);
|
|
|
|
c0 = _mm512_fmadd_ps(a0, b0, c0);
|
|
c0 = _mm512_fmadd_ps(a1, b1, c0);
|
|
c0 = _mm512_fmadd_ps(a2, b2, c0);
|
|
c0 = _mm512_fmadd_ps(a3, b3, c0);
|
|
c0 = _mm512_fmadd_ps(a4, b4, c0);
|
|
c0 = _mm512_fmadd_ps(a5, b5, c0);
|
|
c0 = _mm512_fmadd_ps(a6, b6, c0);
|
|
c0 = _mm512_fmadd_ps(a7, b7, c0);
|
|
|
|
__m512 d0 = _mm512_load_ps(&C[(ii+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
__m512 e0 = _mm512_load_ps(&B[(kk+0) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e1 = _mm512_load_ps(&B[(kk+1) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e2 = _mm512_load_ps(&B[(kk+2) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e3 = _mm512_load_ps(&B[(kk+3) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e4 = _mm512_load_ps(&B[(kk+4) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e5 = _mm512_load_ps(&B[(kk+5) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e6 = _mm512_load_ps(&B[(kk+6) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
__m512 e7 = _mm512_load_ps(&B[(kk+7) * OPTIMAL_MATRIX_SIZE + jj+16]);
|
|
|
|
d0 = _mm512_fmadd_ps(a0, e0, d0);
|
|
d0 = _mm512_fmadd_ps(a1, e1, d0);
|
|
d0 = _mm512_fmadd_ps(a2, e2, d0);
|
|
d0 = _mm512_fmadd_ps(a3, e3, d0);
|
|
d0 = _mm512_fmadd_ps(a4, e4, d0);
|
|
d0 = _mm512_fmadd_ps(a5, e5, d0);
|
|
d0 = _mm512_fmadd_ps(a6, e6, d0);
|
|
d0 = _mm512_fmadd_ps(a7, e7, d0);
|
|
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj], c0);
|
|
_mm512_store_ps(&C[(ii+0)*OPTIMAL_MATRIX_SIZE+jj+16], d0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
switch (mpi_rank)
|
|
{
|
|
case 0:
|
|
mat_mul_optimal_omp_0();
|
|
break;
|
|
case 1:
|
|
mat_mul_optimal_omp_1();
|
|
break;
|
|
case 2:
|
|
mat_mul_optimal_omp_2();
|
|
break;
|
|
case 3:
|
|
mat_mul_optimal_omp_3();
|
|
break;
|
|
default:
|
|
// Assert case
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void mat_mul_optimal_omp_3node() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
switch (mpi_rank)
|
|
{
|
|
case 0:
|
|
mat_mul_optimal_omp_00();
|
|
break;
|
|
case 1:
|
|
mat_mul_optimal_omp_01();
|
|
break;
|
|
case 2:
|
|
mat_mul_optimal_omp_02();
|
|
break;
|
|
default:
|
|
// Assert case
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void mat_mul_omp() {
|
|
// TODO: parallelize & optimize matrix multiplication
|
|
// Use num_threads per node
|
|
float Aik;
|
|
int bs = K;
|
|
int OptimalSliceCount = M/num_threads + 1;
|
|
|
|
#pragma omp parallel for schedule(static)
|
|
for (int ii = 0; ii < M; ii += OptimalSliceCount)
|
|
{
|
|
#if (VALIDATE_THREAD_CNT && 0)
|
|
if (nPrintf == 0)
|
|
{
|
|
printf("Thread count : %d, Tid : %d\n", omp_get_num_threads(), omp_get_thread_num());
|
|
nPrintf = 1;
|
|
}
|
|
#endif // VALIDATE_THREAD_CNT
|
|
for (int kk = 0; kk < K; kk += bs)
|
|
{
|
|
for (int i = ii; i < MIN(ii + OptimalSliceCount, M); ++i)
|
|
{
|
|
for (int k = kk; k < MIN(kk + bs, K); ++k)
|
|
{
|
|
Aik = A[i * K + k];
|
|
for (int j = 0; j < N; ++j)
|
|
{
|
|
C[i * N + j] += Aik * B[k * N + j];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K,
             int _num_threads, int _mpi_rank, int _mpi_world_size) {
  // Public entry point: computes C = A (MxK) * B (KxN) across MPI ranks.
  //
  // Two hand-tuned fast paths exist, both ONLY for the exact problem size
  // OPTIMAL_MATRIX_SIZE (8192) on all three dimensions:
  //   * world size == OPTIMAL_MPI_SIZE_BUG (3): 3-node split; slave row
  //     ranges come from the _3_SLICE_* macros.
  //   * world size == OPTIMAL_MPI_SIZE (4): 4-node split; A and C are cut
  //     into row-eighths, root keeps eighths 0-1, rank r gets 2r and 2r+1.
  // Any other configuration falls through to the single-node OpenMP
  // fallback executed on rank 0 only (see FIXME below).
  //
  // Communication pattern (both fast paths): root streams ALL of B to each
  // slave in 8 chunks (tags 2..9) plus that slave's share of A (tags 0/1)
  // via MPI_Isend; slaves allocate local A/B/C, post matching MPI_Irecv's
  // (same tags), wait, compute, and MPI_Isend their C rows back (tags 0/1).

  // Publish arguments into the file-scope globals used by the kernels.
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads, mpi_rank = _mpi_rank,
  mpi_world_size = _mpi_world_size;
  MPI_Status stMpiStatus;
  MPI_Request stMpiRequest[30];  // 30 = max outstanding requests (root, 4-node path)
#if (ENABLE_TIME_MEASURE)
  double elapsed_time;

  timer_start(1);  // NOTE(review): timer_start/timer_stop come from util.h — not visible here
#endif

  // TODO: parallelize & optimize matrix multiplication on multi-node
  // You must allocate & initialize A, B, C for non-root processes
  omp_set_num_threads(_num_threads);

  // FIXME: for now, only root process runs the matrix multiplication.
  // ------------------------- 3-node fast path -------------------------
  if (_M == OPTIMAL_MATRIX_SIZE
      && _N == OPTIMAL_MATRIX_SIZE
      && _K == OPTIMAL_MATRIX_SIZE
      && _mpi_world_size == OPTIMAL_MPI_SIZE_BUG)
  {
    if (_mpi_rank == 0)
    {
      // Stream the whole B matrix to rank 1 in 8 equal chunks (tags 2..9)
      // so the receiver can start consuming before all of B has arrived.
      MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 9, MPI_COMM_WORLD, &stMpiRequest[7]);
      // Rank 1's two A row slices (tags 0 and 1).
      // NOTE(review): _3_SLICE_TRANSFER_SIZE/_3_SLICE_TRANSFER_ROW are
      // defined outside this excerpt — presumably half of rank 1's row
      // range each; verify against the macro definitions.
      MPI_Isend(A + _3_SLICE_FIRST_ROW * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
      MPI_Isend(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[9]);

      // Same B stream + A slices for rank 2 (requests 10..19).
      MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 2, MPI_COMM_WORLD, &stMpiRequest[10]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 3, MPI_COMM_WORLD, &stMpiRequest[11]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 4, MPI_COMM_WORLD, &stMpiRequest[12]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 5, MPI_COMM_WORLD, &stMpiRequest[13]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 6, MPI_COMM_WORLD, &stMpiRequest[14]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 7, MPI_COMM_WORLD, &stMpiRequest[15]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 8, MPI_COMM_WORLD, &stMpiRequest[16]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 9, MPI_COMM_WORLD, &stMpiRequest[17]);
      MPI_Isend(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 2) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[18]);
      MPI_Isend(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 3) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[19]);

      // Post the C-row receives (requests 20..23) BEFORE computing, so
      // slave results can land while the root is still busy in its kernel.
      MPI_Irecv(C + _3_SLICE_FIRST_ROW * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[20]);
      MPI_Irecv(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[21]);
      MPI_Irecv(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 2) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[22]);
      MPI_Irecv(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * 3) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[23]);
    }
    else
    {
      // Slave rank (1 or 2): allocate local matrices and post receives
      // whose tags mirror the root's sends above.
      // NOTE(review): _alloc_mat presumably heap-allocates (matrices are
      // released with free() below) — confirm against its definition.
      _alloc_mat(&B, K, N);
      MPI_Irecv(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 9, MPI_COMM_WORLD, &stMpiRequest[7]);

      // Receive this rank's two A row slices into the SAME absolute row
      // offsets the root sent from: rank 1 -> slices 0,1; rank 2 -> 2,3.
      _alloc_mat(&A, M, K);
      MPI_Irecv(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
      MPI_Irecv(A + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2 + 1)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[9]);

      _alloc_mat(&C, M, N);

      // Block until all 10 inbound transfers (8x B + 2x A) have landed
      // before running the local kernel.
#pragma GCC unroll 10
      for (int i = 0; i < 10; ++i)
      {
        MPI_Wait(&stMpiRequest[i], &stMpiStatus);
      }
    }
#if (ENABLE_TIME_MEASURE)
    elapsed_time = timer_stop(1);

    printf("%f sec, Sending complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif

    // Every rank computes its own row range (dispatch by mpi_rank).
    mat_mul_optimal_omp_3node();
#if (ENABLE_TIME_MEASURE)
    elapsed_time = timer_stop(1);

    printf("%f sec, Calculation complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif

    if (_mpi_rank == 0)
    {
      // Drain all 24 outstanding requests: the 20 sends (0..19) and the
      // 4 C-row receives (20..23) posted before the kernel ran.
#pragma GCC unroll 24
      for (int i = 0; i < 24; ++i)
      {
        MPI_Wait(&stMpiRequest[i], &stMpiStatus);
      }
    }
    else
    {
      // Ship this rank's two computed C row slices back (tags 0/1 match
      // the root's Irecv's above).
      MPI_Isend(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Isend(C + (_3_SLICE_FIRST_ROW + _3_SLICE_TRANSFER_ROW * ((_mpi_rank - 1) * 2 + 1)) * OPTIMAL_MATRIX_SIZE, _3_SLICE_TRANSFER_SIZE, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);

      // A and B are no longer referenced by any in-flight request (only C
      // is being sent), so they can be freed while the sends complete.
      free(A);
      free(B);
      MPI_Wait(&stMpiRequest[0], &stMpiStatus);
      MPI_Wait(&stMpiRequest[1], &stMpiStatus);
      free(C);  // safe only after both C sends have completed
    }
#if (ENABLE_TIME_MEASURE)
    elapsed_time = timer_stop(1);

    printf("%f sec, Collect Complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
  }
  // ------------------------- 4-node fast path -------------------------
  else if (_M == OPTIMAL_MATRIX_SIZE
           && _N == OPTIMAL_MATRIX_SIZE
           && _K == OPTIMAL_MATRIX_SIZE
           && _mpi_world_size == OPTIMAL_MPI_SIZE)
  {
    if (_mpi_rank == 0)
    {
      // Rank 1: full B in 8 chunks (tags 2..9) + A row-eighths 2 and 3
      // (tags 0/1). Root keeps eighths 0 and 1 for itself.
      MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 9, MPI_COMM_WORLD, &stMpiRequest[7]);
      MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
      MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[9]);

      // Rank 2: full B + A eighths 4 and 5 (requests 10..19).
      MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 2, MPI_COMM_WORLD, &stMpiRequest[10]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 3, MPI_COMM_WORLD, &stMpiRequest[11]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 4, MPI_COMM_WORLD, &stMpiRequest[12]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 5, MPI_COMM_WORLD, &stMpiRequest[13]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 6, MPI_COMM_WORLD, &stMpiRequest[14]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 7, MPI_COMM_WORLD, &stMpiRequest[15]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 8, MPI_COMM_WORLD, &stMpiRequest[16]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 9, MPI_COMM_WORLD, &stMpiRequest[17]);
      MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[18]);
      MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[19]);

      // Rank 3: full B + A eighths 6 and 7 (requests 20..29).
      MPI_Isend(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 2, MPI_COMM_WORLD, &stMpiRequest[20]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 3, MPI_COMM_WORLD, &stMpiRequest[21]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 4, MPI_COMM_WORLD, &stMpiRequest[22]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 5, MPI_COMM_WORLD, &stMpiRequest[23]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 6, MPI_COMM_WORLD, &stMpiRequest[24]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 7, MPI_COMM_WORLD, &stMpiRequest[25]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 8, MPI_COMM_WORLD, &stMpiRequest[26]);
      MPI_Isend(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 9, MPI_COMM_WORLD, &stMpiRequest[27]);
      MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 0, MPI_COMM_WORLD, &stMpiRequest[28]);
      MPI_Isend(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 1, MPI_COMM_WORLD, &stMpiRequest[29]);
    }
    else
    {
      // Slave rank (1..3): mirror the root's sends with matching tags.
      // NOTE(review): _alloc_mat presumably heap-allocates (freed below
      // with free()) — confirm against its definition.
      _alloc_mat(&B, K, N);
      MPI_Irecv(B, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &stMpiRequest[1]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &stMpiRequest[2]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 5, MPI_COMM_WORLD, &stMpiRequest[3]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 6, MPI_COMM_WORLD, &stMpiRequest[4]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 7, MPI_COMM_WORLD, &stMpiRequest[5]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 8, MPI_COMM_WORLD, &stMpiRequest[6]);
      MPI_Irecv(B + OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7, OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 9, MPI_COMM_WORLD, &stMpiRequest[7]);

      // A eighths 2r and 2r+1 land at their absolute row offsets so the
      // compute kernel can index A with global row numbers.
      _alloc_mat(&A, M, K);
      MPI_Irecv(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
      MPI_Irecv(A + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2 + 1)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[9]);

      _alloc_mat(&C, M, N);

      // Block until all 10 inbound transfers (8x B + 2x A) have landed.
#pragma GCC unroll 10
      for (int i = 0; i < 10; ++i)
      {
        MPI_Wait(&stMpiRequest[i], &stMpiStatus);
      }
    }
#if (ENABLE_TIME_MEASURE)
    elapsed_time = timer_stop(1);

    printf("%f sec, Sending complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif

    // Every rank computes its own row range (dispatch by mpi_rank inside).
    mat_mul_optimal_omp();
#if (ENABLE_TIME_MEASURE)
    elapsed_time = timer_stop(1);

    printf("%f sec, Calculation complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif

    if (_mpi_rank == 0)
    {
      // Retire all 30 sends first so the request slots can be reused for
      // the C-row receives below.
      for (int i = 0; i < 30; ++i)
      {
        MPI_Wait(&stMpiRequest[i], &stMpiStatus);
      }

      // Collect the six computed C row-eighths (2..7) from ranks 1..3;
      // tags 0/1 match each slave's two sends.
      MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 2), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 3), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
      MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 4), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 0, MPI_COMM_WORLD, &stMpiRequest[2]);
      MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 5), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 2, 1, MPI_COMM_WORLD, &stMpiRequest[3]);
      MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 6), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 0, MPI_COMM_WORLD, &stMpiRequest[4]);
      MPI_Irecv(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * 7), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 3, 1, MPI_COMM_WORLD, &stMpiRequest[5]);

#pragma GCC unroll 6
      for (int i = 0; i < 6; ++i)
      {
        MPI_Wait(&stMpiRequest[i], &stMpiStatus);
      }
    }
    else
    {
      // Ship this rank's two computed C row-eighths back to the root.
      MPI_Isend(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
      MPI_Isend(C + (OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8 * (_mpi_rank * 2 + 1)), OPTIMAL_MATRIX_SIZE * OPTIMAL_MATRIX_SIZE / 8, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);

      // Only C is referenced by the in-flight sends, so A and B can be
      // freed immediately; C must outlive the two MPI_Wait's.
      free(A);
      free(B);
      MPI_Wait(&stMpiRequest[0], &stMpiStatus);
      MPI_Wait(&stMpiRequest[1], &stMpiStatus);
      free(C);
    }
#if (ENABLE_TIME_MEASURE)
    elapsed_time = timer_stop(1);

    printf("%f sec, Collect Complete! Rank %d\n", elapsed_time, _mpi_rank);
#endif
  }
  else
  {
    // Generic fallback: no distribution at all — rank 0 does the whole
    // multiplication with the OpenMP kernel, other ranks return without
    // work (see the FIXME above).
    if (_mpi_rank == 0)
      mat_mul_omp();
  }
}
|