212 lines
7.2 KiB
C++
212 lines
7.2 KiB
C++
|
#include "mat_mul.h"
|
||
|
|
||
|
#include <cstdlib>
|
||
|
#include <cstdio>
|
||
|
#include <pthread.h>
|
||
|
|
||
|
// Matrices shared between mat_mul() and the worker threads; mat_mul() stores
// the caller's pointers here before any worker is started.
// The workers index them as flat row-major arrays:
//   A[i * K + k], B[k * N + j], C[i * N + j]   (A: M x K, B: K x N, C: M x N)
static float *A;
static float *B;
static float *C;

// Matrix dimensions, copied from the mat_mul() arguments.
static int M;
static int N;
static int K;

// Number of worker threads the rows of A/C are split across.
static int num_threads;
|
||
|
//static int num_threads_my;
|
||
|
|
||
|
/*
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
// ORIGINAL CODE
|
||
|
static void* mat_mul_thread(void *data) {
|
||
|
// TODO: parallelize & optimize matrix multiplication
|
||
|
for (int i = 0; i < M; ++i) {
|
||
|
for (int j = 0; j < N; ++j) {
|
||
|
for (int k = 0; k < K; ++k) {
|
||
|
C[i * N + j] += A[i * K + k] * B[k * N + j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return NULL;
|
||
|
}
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
*/
|
||
|
|
||
|
|
||
|
/*
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
// 1. Use multiple Threading
|
||
|
// About 30 sec for one multiplication.
|
||
|
// Run time: 30.xx secs --> STOPPED BY LIMITED RUN TIME
|
||
|
|
||
|
static void* mat_mul_thread(void *data) {
|
||
|
// TODO: parallelize & optimize matrix multiplication
|
||
|
// Divide-and-Conquer with threading: Divide jobs by num_threads
|
||
|
|
||
|
int pid = * (int *) data; // pthread ID
|
||
|
|
||
|
int i_slice = M / num_threads; // separate jobs by num_threads
|
||
|
int i_start = pid * i_slice; // divide by row of A(i) as starting row
|
||
|
int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A)
|
||
|
|
||
|
for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation
|
||
|
for(int j=0 ; j < N; j++) {
|
||
|
for(int k =0; k < K; k++) {
|
||
|
C[ i*N + j] += A [i* K + k] * B[ k * N + j];
|
||
|
} // for k
|
||
|
} // for j
|
||
|
} // for i
|
||
|
|
||
|
return NULL;
|
||
|
}
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
*/
|
||
|
|
||
|
|
||
|
/*
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
// 1. Use Multiple Threading
|
||
|
// 2. Tiling for column: bs(block size). bs selection affects its performance
|
||
|
// Run time: 4.xx sec, around 30 GFLOPS
|
||
|
|
||
|
//#define MIN(x,y) ((x) <= (y) ? (x) : (y))
|
||
|
|
||
|
static void* mat_mul_thread(void *data) {
|
||
|
// TODO: parallelize & optimize matrix multiplication
|
||
|
// Divide-and-Conquer with threading: Divide jobs by num_threads
|
||
|
// Tiling for kk (column of A or Row of B)
|
||
|
|
||
|
int pid = * (int *) data; // pthread ID
|
||
|
|
||
|
int i_slice = M / num_threads; // separate jobs by num_threads
|
||
|
int i_start = pid * i_slice; // divide by row of A(i) as starting row
|
||
|
int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A)
|
||
|
|
||
|
int idx_iNj;
|
||
|
int idx_iKkk;
|
||
|
|
||
|
int bs = 32; // BLOCKSIZE: 32 --> around 30 GFLOPS, run time: 4.53 sec.(avg.)
|
||
|
//int bs = 64; // BLOCKSIZE: 64 --> around 26 GFLOPS, run time: 5.23 sec.(avg.)
|
||
|
//int bs = 96; // BLOCKSIZE: 96 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)
|
||
|
//int bs = 128; // BLOCKSIZE: 128 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)
|
||
|
|
||
|
int min_kk;
|
||
|
float sum;
|
||
|
|
||
|
for (int kk = 0; kk < K; kk += bs) { // Tiling according to column of A or Row of B
|
||
|
min_kk = ((kk+bs) <= K) ? (kk+bs) : K;
|
||
|
|
||
|
for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation
|
||
|
for(int j=0 ; j < N; j++) {
|
||
|
idx_iKkk = i * K + kk;
|
||
|
idx_iNj = i * N + j;
|
||
|
sum = 0;
|
||
|
//for(int k = kk; k < min_kk; k++) {
|
||
|
for(int k = kk; k < MIN(kk+bs, K); k++) {
|
||
|
sum += A [idx_iKkk++] * B[ k * N + j];
|
||
|
//C[ idx_iNj] += A [idx_iKkk++] * B[ k * N + j];
|
||
|
//C[ i*N + j] += A [i * K + k] * B[ k * N + j];
|
||
|
}
|
||
|
C[idx_iNj] = sum;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return NULL;
|
||
|
|
||
|
}
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
*/
|
||
|
|
||
|
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
// 1. Use Multiple Threading
|
||
|
// 2. Tiling for column of A or row of B: bs(block size). bs selection affects its performance
|
||
|
// 3. To use locality, change the order of (loop j) and (loop k),
|
||
|
// since B[k][j] for j=0,1,2,... is better than B[k][j] for k=0,1,2, ...
|
||
|
// (the latter jumps to a different address on every iteration, so it has less data locality)
|
||
|
// ==> 1D locality is larger than 2D locality for matrix multiplication in this case.
|
||
|
// After also trying 2D tiling, 1D tiling gave better performance than 2D tiling (around 150 ~ 170 GFLOPS)
|
||
|
|
||
|
//#define MIN(x,y) ((x) <= (y) ? (x) : (y))
|
||
|
|
||
|
static void* mat_mul_thread(void *data) {
|
||
|
// TODO: parallelize & optimize matrix multiplication
|
||
|
// Divide-and-Conquer with threading: Divide jobs by num_threads
|
||
|
// Tiling for kk (column of A or Row of B)
|
||
|
// Use data locality: Cange the order of (loop j) and (loop k),
|
||
|
// since B[k][j] for j=0,1,2,... is better than
|
||
|
// B[k][j] for k=0,1,2, ... (Occurred address jumping in every iteration, less data locality)
|
||
|
|
||
|
int pid = * (int *) data; // pthread ID or index ( 0 ~ num_threads-1)
|
||
|
|
||
|
int i_slice = M / num_threads; // separate jobs by num_threads
|
||
|
int i_start = pid * i_slice; // divide by row of A[i][] as starting row
|
||
|
int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A)
|
||
|
|
||
|
float Aik;
|
||
|
// int idx_iK;
|
||
|
// int idx_iN;
|
||
|
// int idx_kN;
|
||
|
|
||
|
//int bs = 16; // BLOCKSIZE: 16 --> around 200 GFLOPS, run time: 0.68 sec.(avg.)
|
||
|
int bs = 32; // BLOCKSIZE: 32 --> around 297 GFLOPS, run time: 0.46 sec.(avg.)
|
||
|
//int bs = 64; // BLOCKSIZE: 64 --> around 256 GFLOPS, run time: 0.54 sec.(avg.)
|
||
|
//int bs = 96; // BLOCKSIZE: 96 --> around 208 GFLOPS, run time: 0.66 sec.(avg.)
|
||
|
//int bs = 128; // BLOCKSIZE: 128 --> around 203 GFLOPS, run time: 0.67 sec.(avg.)
|
||
|
|
||
|
int min_kk;
|
||
|
|
||
|
for (int kk = 0; kk < K; kk += bs) { // Tiling according to column of A or Row of B
|
||
|
min_kk = ((kk+bs) <= K) ? (kk+bs) : K;
|
||
|
|
||
|
for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation
|
||
|
|
||
|
//idx_iK = i * K; // No significant effect
|
||
|
//idx_iN = i * N; // No significant effect
|
||
|
//for(int k = kk; k < MIN(kk+bs, K); k++) {
|
||
|
for(int k = kk; k < min_kk; k++) {
|
||
|
Aik = A[i*K + k]; // Reduce iterative operation
|
||
|
|
||
|
//idx_kN = k * N; // No significant effect
|
||
|
|
||
|
for(int j=0; j < N; j++) {
|
||
|
C[ i*N + j] += Aik * B[ k * N + j]; // slightly increase the performance
|
||
|
|
||
|
//C[ i*N + j] += A[i*K + k] * B[ k * N + j];
|
||
|
//C[ idx_iN + j] += Aik * B[ idx_kN + j]; // No significant effect
|
||
|
|
||
|
} // for j
|
||
|
} // for k
|
||
|
} // for i
|
||
|
} // for kk
|
||
|
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
//----------------------------------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
|
||
|
A = _A, B = _B, C = _C;
|
||
|
M = _M, N = _N, K = _K;
|
||
|
num_threads = _num_threads;
|
||
|
|
||
|
// TODO: create '_num_threads' pthreads
|
||
|
// Divide-and-Conquer: USE Multiple threads
|
||
|
|
||
|
// pthread_t thread;
|
||
|
// pthread_create(&thread, NULL, mat_mul_thread, NULL);
|
||
|
// pthread_join(thread, NULL);
|
||
|
|
||
|
|
||
|
pthread_t * threads;
|
||
|
threads = (pthread_t *) malloc(sizeof(pthread_t) * num_threads);
|
||
|
|
||
|
for (int i = 0; i < num_threads; i++) {
|
||
|
int * pid = (int *) malloc(sizeof(int));
|
||
|
*pid = i;
|
||
|
pthread_create(&threads[i], NULL, mat_mul_thread, pid); // Send pid index as arguments for each thread
|
||
|
}
|
||
|
|
||
|
for (int i = 0; i < num_threads; i++) {
|
||
|
pthread_join(threads[i], NULL);
|
||
|
}
|
||
|
|
||
|
}
|