#include "mat_mul.h" #include #include #include static float *A, *B, *C; static int M, N, K; static int num_threads; //static int num_threads_my; /* //---------------------------------------------------------------------------------------------- // ORIGINAL CODE static void* mat_mul_thread(void *data) { // TODO: parallelize & optimize matrix multiplication for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { for (int k = 0; k < K; ++k) { C[i * N + j] += A[i * K + k] * B[k * N + j]; } } } return NULL; } //---------------------------------------------------------------------------------------------- */ /* //---------------------------------------------------------------------------------------------- // 1. Use multiple Threading // About 30 sec for one multiplication. // Run time: 30.xx secs --> STOPPED BY LIMITED RUN TIME static void* mat_mul_thread(void *data) { // TODO: parallelize & optimize matrix multiplication // Divide-and-Conquer with threading: Divide jobs by num_threads int pid = * (int *) data; // pthread ID int i_slice = M / num_threads; // separate jobs by num_threads int i_start = pid * i_slice; // divide by row of A(i) as starting row int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A) for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation for(int j=0 ; j < N; j++) { for(int k =0; k < K; k++) { C[ i*N + j] += A [i* K + k] * B[ k * N + j]; } // for k } // for j } // for i return NULL; } //---------------------------------------------------------------------------------------------- */ /* //---------------------------------------------------------------------------------------------- // 1. Use Multiple Threading // 2. Tiling for column: bs(block size). bs selection affects its performance // Run time: 4.xx sec, around 30 GFLOPS //#define MIN(x,y) ((x) <= (y) ? (x) : (y)) static void* mat_mul_thread(void *data) { // TODO: parallelize & optimize matrix multiplication // Divide-and-Conquer with threading: Divide jobs by num_threads // Tiling for kk (column of A or Row of B) int pid = * (int *) data; // pthread ID int i_slice = M / num_threads; // separate jobs by num_threads int i_start = pid * i_slice; // divide by row of A(i) as starting row int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A) int idx_iNj; int idx_iKkk; int bs = 32; // BLOCKSIZE: 32 --> around 30 GFLOPS, run time: 4.53 sec.(avg.) //int bs = 64; // BLOCKSIZE: 64 --> around 26 GFLOPS, run time: 5.23 sec.(avg.) //int bs = 96; // BLOCKSIZE: 96 --> around 24 GFLOPS, run time: 5.71 sec.(avg.) //int bs = 128; // BLOCKSIZE: 128 --> around 24 GFLOPS, run time: 5.71 sec.(avg.) int min_kk; float sum; for (int kk = 0; kk < K; kk += bs) { // Tiling according to column of A or Row of B min_kk = ((kk+bs) <= K) ? (kk+bs) : K; for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation for(int j=0 ; j < N; j++) { idx_iKkk = i * K + kk; idx_iNj = i * N + j; sum = 0; //for(int k = kk; k < min_kk; k++) { for(int k = kk; k < MIN(kk+bs, K); k++) { sum += A [idx_iKkk++] * B[ k * N + j]; //C[ idx_iNj] += A [idx_iKkk++] * B[ k * N + j]; //C[ i*N + j] += A [i * K + k] * B[ k * N + j]; } C[idx_iNj] = sum; } } } return NULL; } //---------------------------------------------------------------------------------------------- */ //---------------------------------------------------------------------------------------------- // 1. 
//----------------------------------------------------------------------------------------------
// 1. Use multiple threading.
// 2. Tiling over k (column of A / row of B) with block size bs; the choice of bs affects performance.
// 3. To exploit locality, swap (loop j) and (loop k): reading B[k][j] for j = 0, 1, 2, ... is
//    better than reading B[k][j] for k = 0, 1, 2, ..., which jumps a whole row of N floats on
//    every iteration (an address jump each time, so less data locality).
//    ==> For this matrix multiplication, 1D locality pays off more than 2D locality.
// After trials with 2D tiling, 1D tiling delivered the better performance; the 2D-tiled
// variants reached only around 150 ~ 170 GFLOPS.
//#define MIN(x,y) ((x) <= (y) ? (x) : (y))
static void* mat_mul_thread(void *data) {
  // Divide-and-conquer with threading: divide the rows of A among num_threads threads.
  // Tiling over kk (column of A / row of B).
  // Exploit data locality: swap (loop j) and (loop k) so the innermost loop streams through
  // one row of B (and of C) contiguously instead of jumping N floats per iteration.
  int pid = * (int *) data;                                      // thread index (0 ... num_threads-1)
  int i_slice = M / num_threads;                                 // rows of A per thread
  int i_start = pid * i_slice;                                   // first row of A owned by this thread
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice;  // last thread runs to M (takes the remainder)

  float Aik;
  //int idx_iK;
  //int idx_iN;
  //int idx_kN;
  //int bs = 16;  // BLOCKSIZE: 16  --> around 200 GFLOPS, run time: 0.68 sec. (avg.)
  int bs = 32;    // BLOCKSIZE: 32  --> around 297 GFLOPS, run time: 0.46 sec. (avg.)
  //int bs = 64;  // BLOCKSIZE: 64  --> around 256 GFLOPS, run time: 0.54 sec. (avg.)
  //int bs = 96;  // BLOCKSIZE: 96  --> around 208 GFLOPS, run time: 0.66 sec. (avg.)
  //int bs = 128; // BLOCKSIZE: 128 --> around 203 GFLOPS, run time: 0.67 sec. (avg.)
  int min_kk;

  for (int kk = 0; kk < K; kk += bs) {          // tile over columns of A / rows of B
    min_kk = ((kk + bs) <= K) ? (kk + bs) : K;  // clamp the last tile to K
    for (int i = i_start; i < i_end; i++) {     // rows assigned to this thread
      //idx_iK = i * K;  // no significant effect
      //idx_iN = i * N;  // no significant effect
      //for (int k = kk; k < MIN(kk+bs, K); k++) {
      for (int k = kk; k < min_kk; k++) {
        Aik = A[i*K + k];  // hoist the A element out of the inner loop
        //idx_kN = k * N;  // no significant effect
        for (int j = 0; j < N; j++) {
          C[i*N + j] += Aik * B[k*N + j];  // contiguous stream through B and C; slightly faster
          //C[i*N + j] += A[i*K + k] * B[k*N + j];
          //C[idx_iN + j] += Aik * B[idx_kN + j];  // no significant effect
        } // for j
      } // for k
    } // for i
  } // for kk
  return NULL;
}
//----------------------------------------------------------------------------------------------

void mat_mul(float *_A, float *_B, float *_C,
             int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;

  // Create 'num_threads' pthreads (divide-and-conquer over the rows of A).
  // Original single-thread scaffold:
  //   pthread_t thread;
  //   pthread_create(&thread, NULL, mat_mul_thread, NULL);
  //   pthread_join(thread, NULL);
  pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * num_threads);
  int *pids = (int *) malloc(sizeof(int) * num_threads);  // one index per thread; freed below (avoids the per-thread malloc leak)

  for (int i = 0; i < num_threads; i++) {
    pids[i] = i;
    pthread_create(&threads[i], NULL, mat_mul_thread, &pids[i]);  // pass the thread index as the argument
  }
  for (int i = 0; i < num_threads; i++) {
    pthread_join(threads[i], NULL);
  }

  free(pids);
  free(threads);
}
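//----------------------------------------------------------------------------------------------
// For reference: the "2D tiling" trials mentioned above are not preserved in this file. A
// sketch of what such a variant could look like (tiling over both k and j). This is a
// hypothetical reconstruction, not the code that produced the 150 ~ 170 GFLOPS numbers:
/*
static void* mat_mul_thread_2d(void *data) {
  int pid = * (int *) data;
  int i_slice = M / num_threads;
  int i_start = pid * i_slice;
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice;
  int bs = 32;  // hypothetical block size

  for (int kk = 0; kk < K; kk += bs) {             // tile over k
    int k_max = ((kk + bs) <= K) ? (kk + bs) : K;
    for (int jj = 0; jj < N; jj += bs) {           // tile over j as well
      int j_max = ((jj + bs) <= N) ? (jj + bs) : N;
      for (int i = i_start; i < i_end; i++) {
        for (int k = kk; k < k_max; k++) {
          float Aik = A[i*K + k];
          for (int j = jj; j < j_max; j++) {
            C[i*N + j] += Aik * B[k*N + j];  // inner loop now spans only one j tile
          }
        }
      }
    }
  }
  return NULL;
}
*/
//----------------------------------------------------------------------------------------------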
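//----------------------------------------------------------------------------------------------
// Usage sketch: mat_mul.h is assumed to declare mat_mul() with the signature above; the real
// benchmark driver is not part of this file. A minimal hypothetical harness would look like:
/*
#include <stdlib.h>
#include "mat_mul.h"

int main(void) {
  int m = 2048, n = 2048, k = 2048, threads = 16;  // hypothetical sizes and thread count
  float *a = (float *) malloc(sizeof(float) * (size_t) m * k);
  float *b = (float *) malloc(sizeof(float) * (size_t) k * n);
  float *c = (float *) calloc((size_t) m * n, sizeof(float));  // kernel accumulates, so C must start zeroed
  // ... fill a and b with input data ...
  mat_mul(a, b, c, m, n, k, threads);  // computes C += A * B using 'threads' pthreads
  free(a); free(b); free(c);
  return 0;
}
*/
//----------------------------------------------------------------------------------------------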