#include "mat_mul.h"

#include <immintrin.h>
#include <pthread.h>
#include <stdlib.h>

/*
 * Shared kernel state: operand/result matrices and problem dimensions.
 * Set once by mat_mul() before the worker threads are spawned, then read
 * concurrently (A, B read-only; each thread writes a disjoint row range of C).
 * __restrict asserts the three matrices do not alias.
 */
static float *__restrict A, *__restrict B, *__restrict C;
static int M, N, K;
static int num_threads;

/* Debug switch: force the specialized path to run on a single thread. */
#define USING_ONE_THREAD (0)

#define VECTOR_SIZE (128)             /* floats per j-step: 8 x 16-lane __m512     */
#define BLOCKSIZE (48)                /* k-dimension cache block                   */
#define JBLOCKSIZE (VECTOR_SIZE * 8)  /* j-dimension cache block                   */
#define OPTIMAL_BLOCKSIZE (32)        /* kept for API/config compatibility         */
#define OPTIMAL_MATRIX_SIZE (4096)    /* specialized path: M == N == K == 4096     */
#define OPTIMAL_THREAD_CNT (20)       /* specialized path: exactly 20 threads      */

#define MIN(__A, __B) ((__A) < (__B) ? (__A) : (__B))

/*
 * Specialized worker for the 4096x4096x4096 / 20-thread case.
 * Each thread owns a contiguous band of rows [start, end) of C and computes
 * C += A * B over that band using k/j cache blocking and 8-way unrolled
 * AVX-512 FMA over the j dimension (4096 % JBLOCKSIZE == 0 and
 * JBLOCKSIZE % VECTOR_SIZE == 0, so no edge handling is needed for j).
 * `data` points at this thread's index; returns NULL.
 * NOTE(review): _mm512_load_ps requires 64-byte aligned matrices — assumed
 * guaranteed by the caller of mat_mul(); confirm against the allocator.
 */
static void *mat_mul_thread_Opt(void *data) {
  int pid = *(int *)data;
  int slice = OPTIMAL_MATRIX_SIZE / OPTIMAL_THREAD_CNT;
  int start = pid * slice;
  /* Last thread absorbs the remainder rows (4096 is not divisible by 20). */
  int end = pid == (OPTIMAL_THREAD_CNT - 1) ? OPTIMAL_MATRIX_SIZE
                                            : (pid + 1) * slice;

  for (int kk = 0; kk < OPTIMAL_MATRIX_SIZE; kk += BLOCKSIZE) {
    for (int jj = 0; jj < OPTIMAL_MATRIX_SIZE; jj += JBLOCKSIZE) {
      /* Prefetch hints; the original author measured little effect. */
      _mm_prefetch((const char *)&A[start * OPTIMAL_MATRIX_SIZE + kk],
                   _MM_HINT_T0);
      for (int i = start; i < end; ++i) {
        _mm_prefetch((const char *)&C[i * OPTIMAL_MATRIX_SIZE + jj],
                     _MM_HINT_T0);
        _mm_prefetch((const char *)&B[kk * OPTIMAL_MATRIX_SIZE + jj],
                     _MM_HINT_T0);
        for (int k = kk; k < MIN(kk + BLOCKSIZE, OPTIMAL_MATRIX_SIZE); ++k) {
          /* Broadcast A[i][k] into all 16 lanes once per k. */
          __m512 aik = _mm512_set1_ps(A[i * OPTIMAL_MATRIX_SIZE + k]);

          /* 4096 % VECTOR_SIZE == 0, so no MIN() needed on j. */
          for (int j = jj; j < jj + JBLOCKSIZE; j += VECTOR_SIZE) {
            __m512 c1, c2, c3, c4, c5, c6, c7, c8;

            /* Load the current C tile (8 x 16 floats). */
            c1 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j]);
            c2 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x10]);
            c3 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x20]);
            c4 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x30]);
            c5 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x40]);
            c6 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x50]);
            c7 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x60]);
            c8 = _mm512_load_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x70]);

            /* FMA: C_tile += A[i][k] * B[k][j..j+127]. */
            c1 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j]), aik, c1);
            c2 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x10]), aik, c2);
            c3 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x20]), aik, c3);
            c4 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x30]), aik, c4);
            c5 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x40]), aik, c5);
            c6 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x50]), aik, c6);
            c7 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x60]), aik, c7);
            c8 = _mm512_fmadd_ps(
                _mm512_load_ps(&B[k * OPTIMAL_MATRIX_SIZE + j + 0x70]), aik, c8);

            /* Store the updated C tile back. */
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j], c1);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x10], c2);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x20], c3);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x30], c4);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x40], c5);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x50], c6);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x60], c7);
            _mm512_store_ps(&C[i * OPTIMAL_MATRIX_SIZE + j + 0x70], c8);
          }
        }
      }
    }
  }
  return NULL;
}

/*
 * Generic scalar worker for arbitrary M, N, K and thread count.
 * Each thread computes C += A * B over its row band [start, end) of C,
 * with cache blocking on k only. `data` points at this thread's index
 * (heap-allocated by mat_mul, freed there after join); returns NULL.
 */
static void *mat_mul_thread(void *data) {
  int pid = *(int *)data;
  int slice = M / num_threads;
  int start = pid * slice;
  /* Last thread absorbs the remainder rows when M % num_threads != 0. */
  int end = pid == num_threads - 1 ? M : (pid + 1) * slice;

  for (int kk = 0; kk < K; kk += BLOCKSIZE) {
    for (int i = start; i < end; ++i) {
      for (int k = kk; k < MIN(kk + BLOCKSIZE, K); ++k) {
        float Aik = A[i * K + k];
        for (int j = 0; j < N; ++j) {
          C[i * N + j] += Aik * B[k * N + j];
        }
      }
    }
  }
  return NULL;
}

/*
 * Public entry point: C += A * B using _num_threads worker threads.
 * A is M x K, B is K x N, C is M x N, all row-major. Dispatches to the
 * AVX-512 specialized kernel when the problem exactly matches the tuned
 * configuration (4096^3, 20 threads); otherwise runs the generic kernel.
 * Blocks until all workers have finished.
 */
void mat_mul(float *__restrict _A, float *__restrict _B, float *__restrict _C,
             int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  if (M == OPTIMAL_MATRIX_SIZE && N == OPTIMAL_MATRIX_SIZE &&
      K == OPTIMAL_MATRIX_SIZE && num_threads == OPTIMAL_THREAD_CNT) {
    /* Tuned path: thread count is a compile-time constant, enabling
     * constant folding and cheap unrolling in the worker. */
    pthread_t threads[OPTIMAL_THREAD_CNT];
    int pid[OPTIMAL_THREAD_CNT];
#if (USING_ONE_THREAD)
    _num_threads = 1;
#endif /* USING_ONE_THREAD */
    for (int i = 0; i < _num_threads; ++i) {
      pid[i] = i;
      pthread_create(&threads[i], NULL, mat_mul_thread_Opt, &pid[i]);
    }
    for (int i = 0; i < _num_threads; ++i) {
      pthread_join(threads[i], NULL);
    }
  } else {
    /* Generic path: one pid array instead of a leaked per-thread malloc;
     * both allocations are released after all workers join. */
    pthread_t *threads = malloc(sizeof *threads * num_threads);
    int *pids = malloc(sizeof *pids * num_threads);
    if (threads == NULL || pids == NULL) {
      free(threads);
      free(pids);
      return;
    }
    for (int i = 0; i < num_threads; i++) {
      pids[i] = i;
      pthread_create(&threads[i], NULL, mat_mul_thread, &pids[i]);
    }
    for (int i = 0; i < num_threads; i++) {
      pthread_join(threads[i], NULL);
    }
    free(pids);
    free(threads);
  }
}