chundoong-lab-ta/SamsungDS22/submissions/HW2/bumhee86.lee/mat_mul.cpp

#include "mat_mul.h"

#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <immintrin.h>

// __restrict directive means that no memory aliasing on those pointer variables, not much impressive effects
static float *__restrict A, * __restrict B, * __restrict C;
static int M, N, K;
static int num_threads;

// Debug
#define USING_ONE_THREAD (0)

#define VECTOR_SIZE (128)
#define BLOCKSIZE (48)
#define JBLOCKSIZE (VECTOR_SIZE * 8)
#define OPTIMAL_BLOCKSIZE (32)
#define OPTIMAL_MATRIX_SIZE (4096)
#define OPTIMAL_THREAD_CNT (20)
#define MIN(__A,__B) ((__A) < (__B) ? (__A) : (__B)) // Can std::min goes to inline function??

static void* mat_mul_thread_Opt(void* data) {
    int pid = *(int*)data;
    int slice = OPTIMAL_MATRIX_SIZE / OPTIMAL_THREAD_CNT;
    int start = pid * slice;
    int end = pid == (OPTIMAL_THREAD_CNT - 1) ? OPTIMAL_MATRIX_SIZE : (pid + 1) * slice;

    for (int kk = 0; kk < OPTIMAL_MATRIX_SIZE; kk += BLOCKSIZE)
    {
        for (int jj = 0; jj < OPTIMAL_MATRIX_SIZE; jj += JBLOCKSIZE)
        {
            // Prefetch, not much impressive effects
            _mm_prefetch((const char*)&A[start * OPTIMAL_MATRIX_SIZE + kk], _MM_HINT_T0);
            for (int i = start; i < end; ++i)
            {
#if 1
                // Prefetch, not much impressive effects
                 _mm_prefetch((const char*)&C[i * OPTIMAL_MATRIX_SIZE + jj], _MM_HINT_T0);
                 _mm_prefetch((const char*)&B[kk * OPTIMAL_MATRIX_SIZE + jj], _MM_HINT_T0);
                for (int k = kk; k < MIN(kk + BLOCKSIZE, OPTIMAL_MATRIX_SIZE); ++k)
                {
                    // GCC merges aik variables, meaningless but tried
                    __m512 aik, aik2, aik3, aik4;
                    // Fill A values to aik (32 bit per float * 16EA = 512 bit (__m512))
                    aik4 = aik3 = aik2 = aik = _mm512_set1_ps(A[i * OPTIMAL_MATRIX_SIZE + k]);

                    // 4096 % 128 == 0, don't need to use MIN macro
                    for (int j = jj; j < jj + JBLOCKSIZE; j += VECTOR_SIZE)
                    {
                        __m512 c1, c2, c3, c4, c5, c6, c7, c8;

                        // Load C matrix variables
                        c1 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j]);
                        c2 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x10]);
                        c3 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x20]);
                        c4 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x30]);
                        c5 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x40]);
                        c6 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x50]);
                        c7 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x60]);
                        c8 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x70]);

                        // FMA Local C = B * A + C
                        c1 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j]), aik, c1);
                        c2 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x10]), aik2, c2);
                        c3 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x20]), aik3, c3);
                        c4 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x30]), aik4, c4);
                        c5 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x40]), aik, c5);
                        c6 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x50]), aik2, c6);
                        c7 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x60]), aik3, c7);
                        c8 = _mm512_fmadd_ps(_mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x70]), aik4, c8);

                        // Store C matrix values
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j], c1);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x10], c2);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x20], c3);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x30], c4);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x40], c5);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x50], c6);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x60], c7);
                        _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x70], c8);
                    }
                }
#else
                for (int k = kk; k < kk + BLOCKSIZE; ++k)
                {
                    float Aik, Aik2;
                    Aik2 = Aik = A[i * K + k];
                    for (int j = 0; j < N; j = j + 4)
                    {
                        C[i * N + j] += Aik * B[k * N + j];
                        C[i * N + j + 1] += Aik2 * B[k * N + j + 1];
                        C[i * N + j + 2] += Aik * B[k * N + j + 2];
                        C[i * N + j + 3] += Aik2 * B[k * N + j + 3];
                    }
                }
#endif
            }
        }
    }

    return NULL;
}

static void* mat_mul_thread(void *data) {
    int pid = *(int*)data;

    int slice = M / num_threads;
    int start = pid * slice;
    int end = pid == num_threads - 1 ? M : (pid + 1) * slice;

    float Aik;
    int bs = BLOCKSIZE;

    for (int kk = 0; kk < K; kk += bs) {
             {
                for (int i = start; i < end; ++i)
                {
                    for (int k = kk; k < MIN(kk + bs, K); ++k) {
                        Aik = A[i * K + k];
                        for (int j = 0; j < N; ++j) {
                            C[i * N + j] += Aik * B[k * N + j];
                        }
                    }
                }
            }
    }

    return NULL;
}

void mat_mul(float * __restrict _A, float * __restrict _B, float * __restrict _C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  if (M == OPTIMAL_MATRIX_SIZE
      && N == OPTIMAL_MATRIX_SIZE
      && K == OPTIMAL_MATRIX_SIZE
      && num_threads == OPTIMAL_THREAD_CNT)
  {
      // Performance checking condition, can reduce unroll cost, constant folding ...
      pthread_t threads[OPTIMAL_THREAD_CNT];
      int pid[OPTIMAL_THREAD_CNT];

#if (USING_ONE_THREAD)
    _num_threads = 1;
#endif // USING_ONE_THREAD

      for (int i = 0; i < _num_threads; ++i) {
          pid[i] = i;
          pthread_create(&threads[i], NULL, mat_mul_thread_Opt, &pid[i]);
      }

      for (int i = 0; i < _num_threads; ++i) {
          pthread_join(threads[i], NULL);
      }
  }
  else
  {
      pthread_t* threads;
      threads = (pthread_t*)malloc(sizeof(pthread_t) * num_threads);

      for (int i = 0; i < num_threads; i++) {
          int* pid = (int*)malloc(sizeof(int));
          *pid = i;
          pthread_create(&threads[i], NULL, mat_mul_thread, pid);
      }

      for (int i = 0; i < num_threads; i++) {
          pthread_join(threads[i], NULL);
      }
  }
}