// chundoong-lab-ta/SamsungDS22/submissions/HW2/dk2003.lim/mat_mul.cpp

#include "mat_mul.h"
#include <cstdlib>
#include <cstdio>
#include <pthread.h>
static float *A, *B, *C;
static int M, N, K;
static int num_threads;
/*
//----------------------------------------------------------------------------------------------
// ORIGINAL CODE
static void* mat_mul_thread(void *data) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      for (int k = 0; k < K; ++k) {
        C[i * N + j] += A[i * K + k] * B[k * N + j];
      }
    }
  }
  return NULL;
}
//----------------------------------------------------------------------------------------------
*/
/*
//----------------------------------------------------------------------------------------------
// 1. Use multiple threads
// About 30 sec for one multiplication.
// Run time: 30.xx sec --> STOPPED BY LIMITED RUN TIME
static void* mat_mul_thread(void *data) {
  // Parallelize by splitting the rows of A evenly across num_threads threads.
  int pid = * (int *) data;        // pthread index (0 ~ num_threads-1)
  int i_slice = M / num_threads;   // rows per thread
  int i_start = pid * i_slice;     // first row of A handled by this thread
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // last thread also takes the remainder rows, up to M
  for (int i = i_start; i < i_end; i++) {
    for (int j = 0; j < N; j++) {
      for (int k = 0; k < K; k++) {
        C[i * N + j] += A[i * K + k] * B[k * N + j];
      } // for k
    } // for j
  } // for i
  return NULL;
}
//----------------------------------------------------------------------------------------------
*/
/*
//----------------------------------------------------------------------------------------------
// 1. Use multiple threads
// 2. Tiling along the k dimension (columns of A / rows of B) with block size bs;
//    the choice of bs affects performance.
// Run time: 4.xx sec, around 30 GFLOPS
static void* mat_mul_thread(void *data) {
  // Split the rows of A among num_threads threads, and tile the k loop by bs.
  int pid = * (int *) data;        // pthread index (0 ~ num_threads-1)
  int i_slice = M / num_threads;   // rows per thread
  int i_start = pid * i_slice;     // first row of A handled by this thread
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // last thread also takes the remainder rows, up to M
  int idx_iNj;
  int idx_iKkk;
  int bs = 32;    // BLOCKSIZE: 32  --> around 30 GFLOPS, run time: 4.53 sec.(avg.)
  //int bs = 64;  // BLOCKSIZE: 64  --> around 26 GFLOPS, run time: 5.23 sec.(avg.)
  //int bs = 96;  // BLOCKSIZE: 96  --> around 24 GFLOPS, run time: 5.71 sec.(avg.)
  //int bs = 128; // BLOCKSIZE: 128 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)
  int min_kk;
  float sum;
  for (int kk = 0; kk < K; kk += bs) { // tile over k (columns of A / rows of B)
    min_kk = ((kk + bs) <= K) ? (kk + bs) : K;
    for (int i = i_start; i < i_end; i++) {
      for (int j = 0; j < N; j++) {
        idx_iKkk = i * K + kk;
        idx_iNj = i * N + j;
        sum = 0;
        for (int k = kk; k < min_kk; k++) {
          sum += A[idx_iKkk++] * B[k * N + j];
        }
        C[idx_iNj] += sum; // accumulate (C assumed zero-initialized); '=' would drop earlier kk blocks
      }
    }
  }
  return NULL;
}
//----------------------------------------------------------------------------------------------
*/
//----------------------------------------------------------------------------------------------
// 1. Use multiple threads
// 2. Tiling along the k dimension (columns of A / rows of B) with block size bs;
//    the choice of bs affects performance.
// 3. For locality, swap the order of (loop j) and (loop k):
//    scanning B[k][j] over j = 0, 1, 2, ... walks contiguous memory, while scanning
//    over k = 0, 1, 2, ... jumps N floats on every iteration (poor data locality).
// ==> For this matrix multiplication, 1D (k-only) tiling beats 2D tiling:
//     2D tiling trials reached only around 150 ~ 170 GFLOPS.
static void* mat_mul_thread(void *data) {
  // Split the rows of A among num_threads threads; tile the k loop by bs;
  // keep j as the innermost loop so the rows of B and C are walked contiguously.
  int pid = * (int *) data;        // pthread index (0 ~ num_threads-1)
  int i_slice = M / num_threads;   // rows per thread
  int i_start = pid * i_slice;     // first row of A[i][] handled by this thread
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // last thread also takes the remainder rows, up to M
  float Aik;
  // int idx_iK;
  // int idx_iN;
  // int idx_kN;
  //int bs = 16;  // BLOCKSIZE: 16  --> around 200 GFLOPS, run time: 0.68 sec.(avg.)
  int bs = 32;    // BLOCKSIZE: 32  --> around 297 GFLOPS, run time: 0.46 sec.(avg.)
  //int bs = 64;  // BLOCKSIZE: 64  --> around 256 GFLOPS, run time: 0.54 sec.(avg.)
  //int bs = 96;  // BLOCKSIZE: 96  --> around 208 GFLOPS, run time: 0.66 sec.(avg.)
  //int bs = 128; // BLOCKSIZE: 128 --> around 203 GFLOPS, run time: 0.67 sec.(avg.)
  int min_kk;
  for (int kk = 0; kk < K; kk += bs) { // tile over k (columns of A / rows of B)
    min_kk = ((kk + bs) <= K) ? (kk + bs) : K;
    for (int i = i_start; i < i_end; i++) {
      //idx_iK = i * K; // No significant effect
      //idx_iN = i * N; // No significant effect
      for (int k = kk; k < min_kk; k++) {
        Aik = A[i * K + k]; // hoist A[i][k] out of the inner loop; slightly increases performance
        //idx_kN = k * N;   // No significant effect
        for (int j = 0; j < N; j++) {
          C[i * N + j] += Aik * B[k * N + j]; // contiguous walk over the rows of B and C
          //C[idx_iN + j] += Aik * B[idx_kN + j]; // No significant effect
        } // for j
      } // for k
    } // for i
  } // for kk
  return NULL;
}
//----------------------------------------------------------------------------------------------
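/*
//----------------------------------------------------------------------------------------------
// Sketch (not used above; an illustrative refinement, not part of the submitted code):
// instead of giving all remainder rows (M % num_threads) to the last thread, spread them
// one-per-thread across the first threads so the slices differ by at most one row.
// The helper name row_range and the out-parameters lo/hi are hypothetical.
static void row_range(int pid, int *lo, int *hi) {
  int base = M / num_threads;                   // rows every thread gets
  int rem  = M % num_threads;                   // leftover rows to spread around
  *lo = pid * base + (pid < rem ? pid : rem);   // first 'rem' threads each take one extra row
  *hi = *lo + base + (pid < rem ? 1 : 0);
}
//----------------------------------------------------------------------------------------------
*/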
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;
  // Create '_num_threads' pthreads; each computes its own slice of rows.
  pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * num_threads);
  int *pids = (int *) malloc(sizeof(int) * num_threads); // one id per thread; must outlive the threads
  for (int i = 0; i < num_threads; i++) {
    pids[i] = i;
    pthread_create(&threads[i], NULL, mat_mul_thread, &pids[i]); // pass the thread index as argument
  }
  for (int i = 0; i < num_threads; i++) {
    pthread_join(threads[i], NULL);
  }
  free(pids); // release per-call allocations
  free(threads);
}
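//----------------------------------------------------------------------------------------------
// Minimal standalone driver: a sketch under assumptions, not part of the course's grading
// harness (which presumably supplies its own main). Sizes, values, and the MAT_MUL_DEMO
// guard are illustrative. Build e.g.:
//   g++ -O2 -pthread -DMAT_MUL_DEMO mat_mul.cpp -o mat_mul_demo
#ifdef MAT_MUL_DEMO
int main() {
  const int m = 64, n = 64, k = 64, nthreads = 4;    // small sizes for a quick check
  float *a = (float *) malloc(sizeof(float) * m * k);
  float *b = (float *) malloc(sizeof(float) * k * n);
  float *c = (float *) calloc(m * n, sizeof(float)); // C must start zeroed: the kernel accumulates
  for (int i = 0; i < m * k; i++) a[i] = 1.0f;
  for (int i = 0; i < k * n; i++) b[i] = 2.0f;
  mat_mul(a, b, c, m, n, k, nthreads);
  // Every element of C should equal k * 1.0f * 2.0f = 128.
  printf("C[0] = %f (expected %f)\n", c[0], (float) k * 2.0f);
  free(a); free(b); free(c);
  return 0;
}
#endif
//----------------------------------------------------------------------------------------------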