chundoong-lab-ta/SamsungDS22/submissions/HW2/dk2003.lim/mat_mul.cpp

#include "mat_mul.h"

#include <cstdlib>
#include <cstdio>
#include <pthread.h>

// Matrix operands shared by every worker thread; set once per mat_mul() call.
// All three are indexed as flat row-major buffers by the worker.
static float *A;  // left operand, indexed as A[i * K + k]
static float *B;  // right operand, indexed as B[k * N + j]
static float *C;  // result, indexed as C[i * N + j]

// Problem dimensions: A is M x K, B is K x N, C is M x N.
static int M;
static int N;
static int K;

// Number of worker threads the rows of A/C are divided among.
static int num_threads;

/*
//----------------------------------------------------------------------------------------------
// ORIGINAL CODE
static void* mat_mul_thread(void *data) {
  // TODO: parallelize & optimize matrix multiplication
  for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        for (int k = 0; k < K; ++k) {
          C[i * N + j] += A[i * K + k] * B[k * N + j];
        }
    }
  }

  return NULL;
}
//----------------------------------------------------------------------------------------------
*/


/*
//----------------------------------------------------------------------------------------------
// 1. Use multiple Threading 
//    About 30 sec for one multiplication.
//    Run time: 30.xx secs --> STOPPED BY LIMITED RUN TIME 

static void* mat_mul_thread(void *data) {
  // TODO: parallelize & optimize matrix multiplication
  // Divide-and-Conquer with threading: Divide jobs by num_threads 

  int pid = * (int *) data;		// pthread ID

  int i_slice = M / num_threads;	// separate jobs by num_threads
  int i_start = pid * i_slice;		// divide by row of A(i) as starting row
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice;		// ending row will be (slice size or M (max row of A)

  for (int i = i_start; i < i_end; i++) {	// Threading by pid and its row location calculation
	for(int j=0 ; j < N; j++) {
	  for(int k =0; k < K; k++) {
		C[ i*N + j] += A [i* K + k] * B[ k * N + j];
	  }	// for k
	} // for j
  } // for i

  return NULL;
}
//----------------------------------------------------------------------------------------------
*/


/*
//----------------------------------------------------------------------------------------------
// 1. Use Multiple Threading 
// 2. Tiling for column: bs(block size). bs selection affects its performance
//    Run time: 4.xx sec, around 30 GFLOPS

//#define MIN(x,y)  ((x) <= (y) ? (x) : (y))

static void* mat_mul_thread(void *data) {
  // TODO: parallelize & optimize matrix multiplication
  // Divide-and-Conquer with threading: Divide jobs by num_threads 
  // Tiling for kk (column of A or Row of B)

  int pid = * (int *) data;		// pthread ID

  int i_slice = M / num_threads;	// separate jobs by num_threads
  int i_start = pid * i_slice;		// divide by row of A(i) as starting row
  int i_end = (pid == num_threads - 1) ? M : i_start + i_slice;		// ending row will be (slice size or M (max row of A)
				  
  int idx_iNj;
  int idx_iKkk;
	
  int bs = 32;	// BLOCKSIZE: 32 --> around 30 GFLOPS, run time: 4.53 sec.(avg.)
  //int bs = 64;	// BLOCKSIZE: 64 --> around 26 GFLOPS, run time: 5.23 sec.(avg.)
  //int bs = 96;	// BLOCKSIZE: 96 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)
  //int bs = 128;	// BLOCKSIZE: 128 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)

  int min_kk;
  float sum;

  for (int kk = 0; kk < K; kk += bs) {	// Tiling according to column of A or Row of B
	min_kk = ((kk+bs) <= K) ? (kk+bs) : K;

    for (int i = i_start; i < i_end; i++) {	// Threading by pid and its row location calculation
		for(int j=0 ; j < N; j++) {
		    idx_iKkk = i * K + kk;
		    idx_iNj = i * N + j;
			sum = 0;
			//for(int k = kk; k < min_kk; k++) {
			for(int k = kk; k < MIN(kk+bs, K); k++) {
				sum += A [idx_iKkk++] * B[ k * N + j];
				//C[ idx_iNj] += A [idx_iKkk++] * B[ k * N + j];
				//C[ i*N + j] += A [i * K + k] * B[ k * N + j];
			}
		 C[idx_iNj] = sum;
		}
    }
  }

  return NULL;
  
}
//----------------------------------------------------------------------------------------------
*/


//----------------------------------------------------------------------------------------------
// 1. Use multiple threads.
// 2. Tile along the columns of A / rows of B with block size bs; the choice of bs affects performance.
// 3. To exploit locality, swap the order of the j loop and the k loop:
//     walking B[k][j] over j = 0, 1, 2, ... touches consecutive addresses, whereas
//     walking B[k][j] over k = 0, 1, 2, ... jumps to a different row on every iteration (poor data locality).
//     ==> In this case 1D tiling exploits locality better than 2D tiling for matrix multiplication.
//         2D tiling was also tried; 1D tiling gave better performance than 2D tiling (around 150 ~ 170 GFLOPS).

//#define MIN(x,y)  ((x) <= (y) ? (x) : (y))

/*
 * Worker entry point: accumulates one horizontal slice of C += A * B.
 *
 * data  points to an int holding this worker's index in [0, num_threads).
 *       The pointee is only read; ownership stays with the caller.
 * Returns NULL always.
 *
 * Reads the file-scope A, B, C, M, N, K and num_threads, which must be set
 * before the worker starts. Results are added to the existing contents of C
 * (+=), so C is not cleared here.
 *
 * Rows are split so that the first (M % num_threads) workers take one extra
 * row; every row in [0, M) belongs to exactly one worker. The k dimension is
 * processed in tiles of `bs`, and the j loop is innermost so that B and C are
 * walked along consecutive addresses.
 */
static void* mat_mul_thread(void *data) {
  const int tid = *(int *) data;

  // Balanced row partition: spread the remainder instead of giving it all
  // to the last worker.
  const int base_rows = M / num_threads;
  const int extra_rows = M % num_threads;
  const int row_begin = tid * base_rows + (tid < extra_rows ? tid : extra_rows);
  const int row_end = row_begin + base_rows + (tid < extra_rows ? 1 : 0);

  // Tile size along k. Author's measurements for this kernel:
  //   16 -> ~200 GFLOPS, 32 -> ~297 GFLOPS, 64 -> ~256 GFLOPS,
  //   96 -> ~208 GFLOPS, 128 -> ~203 GFLOPS.
  const int bs = 32;

  int kk = 0;
  while (kk < K) {
    // Written as (K - kk > bs) so the tile end never computes kk + bs past K.
    const int k_end = (K - kk > bs) ? kk + bs : K;

    for (int i = row_begin; i < row_end; i++) {
      // Row offsets are computed in size_t so matrices with more than
      // INT_MAX elements do not overflow the index arithmetic.
      const float *a_row = A + (size_t) i * (size_t) K;
      float *c_row = C + (size_t) i * (size_t) N;

      for (int k = kk; k < k_end; k++) {
        const float a_ik = a_row[k];  // invariant across the j loop
        const float *b_row = B + (size_t) k * (size_t) N;

        for (int j = 0; j < N; j++) {
          c_row[j] += a_ik * b_row[j];
        }
      }
    }

    kk = k_end;
  }

  return NULL;
}

//----------------------------------------------------------------------------------------------


/*
 * Multi-threaded matrix multiply: C += A * B.
 *
 * _A            M x K input, flat row-major buffer
 * _B            K x N input, flat row-major buffer
 * _C            M x N output, flat row-major buffer; the workers accumulate
 *               into it (+=), so its prior contents are part of the result
 * _M, _N, _K    matrix dimensions
 * _num_threads  number of worker threads; values below 1 are treated as 1
 *
 * Publishes the arguments through the file-scope variables, starts one
 * worker per slice, waits for all of them, and releases everything it
 * allocated. If a worker thread cannot be created, its slice is computed on
 * the calling thread instead, so the result is complete either way.
 */
void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A;
  B = _B;
  C = _C;
  M = _M;
  N = _N;
  K = _K;
  // The worker divides by num_threads, so never publish a value below 1.
  num_threads = (_num_threads > 0) ? _num_threads : 1;

  // One record per worker: its handle, the index it reads through `data`,
  // and whether a thread was actually started for it.
  struct worker {
    pthread_t handle;
    int id;
    int started;
  };

  struct worker *workers =
      (struct worker *) malloc(sizeof(*workers) * (size_t) num_threads);

  if (workers == NULL) {
    // Out of memory for the bookkeeping: do the whole job on this thread.
    int only_id = 0;
    num_threads = 1;
    mat_mul_thread(&only_id);
    return;
  }

  for (int t = 0; t < num_threads; t++) {
    workers[t].id = t;
    workers[t].started =
        (pthread_create(&workers[t].handle, NULL, mat_mul_thread, &workers[t].id) == 0);

    if (!workers[t].started) {
      // Thread creation failed: compute this slice inline. Slices cover
      // disjoint rows of C, so this does not conflict with running workers.
      mat_mul_thread(&workers[t].id);
    }
  }

  for (int t = 0; t < num_threads; t++) {
    if (workers[t].started) {
      pthread_join(workers[t].handle, NULL);
    }
  }

  // Workers only read their id during execution; safe to release after joins.
  free(workers);
}
. 2022-09-29 18:01:45 +09:00			`#include "mat_mul.h"`

			`#include <cstdlib>`
			`#include <cstdio>`
			`#include <pthread.h>`

			`static float *A, *B, *C;`
			`static int M, N, K;`
			`static int num_threads;`
			`//static int num_threads_my;`

			`/*`
			`//----------------------------------------------------------------------------------------------`
			`// ORIGINAL CODE`
			`static void* mat_mul_thread(void *data) {`
			`// TODO: parallelize & optimize matrix multiplication`
			`for (int i = 0; i < M; ++i) {`
			`for (int j = 0; j < N; ++j) {`
			`for (int k = 0; k < K; ++k) {`
			`C[i * N + j] += A[i * K + k] * B[k * N + j];`
			`}`
			`}`
			`}`

			`return NULL;`
			`}`
			`//----------------------------------------------------------------------------------------------`
			`*/`


			`/*`
			`//----------------------------------------------------------------------------------------------`
			`// 1. Use multiple Threading`
			`// About 30 sec for one multiplication.`
			`// Run time: 30.xx secs --> STOPPED BY LIMITED RUN TIME`

			`static void* mat_mul_thread(void *data) {`
			`// TODO: parallelize & optimize matrix multiplication`
			`// Divide-and-Conquer with threading: Divide jobs by num_threads`

			`int pid = * (int *) data; // pthread ID`

			`int i_slice = M / num_threads; // separate jobs by num_threads`
			`int i_start = pid * i_slice; // divide by row of A(i) as starting row`
			`int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A)`

			`for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation`
			`for(int j=0 ; j < N; j++) {`
			`for(int k =0; k < K; k++) {`
			`C[ i*N + j] += A [i* K + k] * B[ k * N + j];`
			`} // for k`
			`} // for j`
			`} // for i`

			`return NULL;`
			`}`
			`//----------------------------------------------------------------------------------------------`
			`*/`


			`/*`
			`//----------------------------------------------------------------------------------------------`
			`// 1. Use Multiple Threading`
			`// 2. Tiling for column: bs(block size). bs selection affects its performance`
			`// Run time: 4.xx sec, around 30 GFLOPS`

			`//#define MIN(x,y) ((x) <= (y) ? (x) : (y))`

			`static void* mat_mul_thread(void *data) {`
			`// TODO: parallelize & optimize matrix multiplication`
			`// Divide-and-Conquer with threading: Divide jobs by num_threads`
			`// Tiling for kk (column of A or Row of B)`

			`int pid = * (int *) data; // pthread ID`

			`int i_slice = M / num_threads; // separate jobs by num_threads`
			`int i_start = pid * i_slice; // divide by row of A(i) as starting row`
			`int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A)`

			`int idx_iNj;`
			`int idx_iKkk;`

			`int bs = 32; // BLOCKSIZE: 32 --> around 30 GFLOPS, run time: 4.53 sec.(avg.)`
			`//int bs = 64; // BLOCKSIZE: 64 --> around 26 GFLOPS, run time: 5.23 sec.(avg.)`
			`//int bs = 96; // BLOCKSIZE: 96 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)`
			`//int bs = 128; // BLOCKSIZE: 128 --> around 24 GFLOPS, run time: 5.71 sec.(avg.)`

			`int min_kk;`
			`float sum;`

			`for (int kk = 0; kk < K; kk += bs) { // Tiling according to column of A or Row of B`
			`min_kk = ((kk+bs) <= K) ? (kk+bs) : K;`

			`for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation`
			`for(int j=0 ; j < N; j++) {`
			`idx_iKkk = i * K + kk;`
			`idx_iNj = i * N + j;`
			`sum = 0;`
			`//for(int k = kk; k < min_kk; k++) {`
			`for(int k = kk; k < MIN(kk+bs, K); k++) {`
			`sum += A [idx_iKkk++] * B[ k * N + j];`
			`//C[ idx_iNj] += A [idx_iKkk++] * B[ k * N + j];`
			`//C[ i*N + j] += A [i * K + k] * B[ k * N + j];`
			`}`
			`C[idx_iNj] = sum;`
			`}`
			`}`
			`}`

			`return NULL;`

			`}`
			`//----------------------------------------------------------------------------------------------`
			`*/`


			`//----------------------------------------------------------------------------------------------`
			`// 1. Use Multiple Threading`
			`// 2. Tiling for column of A or row of B: bs(block size). bs selection affects its performance`
			`// 3. To use locality, change the order of (loop j) and (loop k),`
			`// since B[k][j] for j=0,1,2,... is better than B[k][j] for k=0,1,2, ...`
			`// (Occurred address jumping in every iteration, less data locality)`
			`// ==> 1D locality is larger than 2D locality for matrix multiplication in this case.`
			`// After trials for 2D tiling, as a result, 1D tiling has driven better performance than 2D tiling (around 150 ~ 170 GFLOPS)`

			`//#define MIN(x,y) ((x) <= (y) ? (x) : (y))`

			`static void* mat_mul_thread(void *data) {`
			`// TODO: parallelize & optimize matrix multiplication`
			`// Divide-and-Conquer with threading: Divide jobs by num_threads`
			`// Tiling for kk (column of A or Row of B)`
			`// Use data locality: Cange the order of (loop j) and (loop k),`
			`// since B[k][j] for j=0,1,2,... is better than`
			`// B[k][j] for k=0,1,2, ... (Occurred address jumping in every iteration, less data locality)`

			`int pid = * (int *) data; // pthread ID or index ( 0 ~ num_threads-1)`

			`int i_slice = M / num_threads; // separate jobs by num_threads`
			`int i_start = pid * i_slice; // divide by row of A[i][] as starting row`
			`int i_end = (pid == num_threads - 1) ? M : i_start + i_slice; // ending row will be (slice size or M (max row of A)`

			`float Aik;`
			`// int idx_iK;`
			`// int idx_iN;`
			`// int idx_kN;`

			`//int bs = 16; // BLOCKSIZE: 16 --> around 200 GFLOPS, run time: 0.68 sec.(avg.)`
			`int bs = 32; // BLOCKSIZE: 32 --> around 297 GFLOPS, run time: 0.46 sec.(avg.)`
			`//int bs = 64; // BLOCKSIZE: 64 --> around 256 GFLOPS, run time: 0.54 sec.(avg.)`
			`//int bs = 96; // BLOCKSIZE: 96 --> around 208 GFLOPS, run time: 0.66 sec.(avg.)`
			`//int bs = 128; // BLOCKSIZE: 128 --> around 203 GFLOPS, run time: 0.67 sec.(avg.)`

			`int min_kk;`

			`for (int kk = 0; kk < K; kk += bs) { // Tiling according to column of A or Row of B`
			`min_kk = ((kk+bs) <= K) ? (kk+bs) : K;`

			`for (int i = i_start; i < i_end; i++) { // Threading by pid and its row location calculation`

			`//idx_iK = i * K; // No significant effect`
			`//idx_iN = i * N; // No significant effect`
			`//for(int k = kk; k < MIN(kk+bs, K); k++) {`
			`for(int k = kk; k < min_kk; k++) {`
			`Aik = A[i*K + k]; // Reduce iterative operation`

			`//idx_kN = k * N; // No significant effect`

			`for(int j=0; j < N; j++) {`
			`C[ i*N + j] += Aik * B[ k * N + j]; // slightly increase the performance`

			`//C[ i*N + j] += A[i*K + k] * B[ k * N + j];`
			`//C[ idx_iN + j] += Aik * B[ idx_kN + j]; // No significant effect`

			`} // for j`
			`} // for k`
			`} // for i`
			`} // for kk`

			`return NULL;`
			`}`

			`//----------------------------------------------------------------------------------------------`


			`void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {`
			`A = _A, B = _B, C = _C;`
			`M = _M, N = _N, K = _K;`
			`num_threads = _num_threads;`

			`// TODO: create '_num_threads' pthreads`
			`// Divide-and-Conquer: USE Multiple threads`

			`// pthread_t thread;`
			`// pthread_create(&thread, NULL, mat_mul_thread, NULL);`
			`// pthread_join(thread, NULL);`


			`pthread_t * threads;`
			`threads = (pthread_t *) malloc(sizeof(pthread_t) * num_threads);`

			`for (int i = 0; i < num_threads; i++) {`
			`int * pid = (int *) malloc(sizeof(int));`
			`*pid = i;`
			`pthread_create(&threads[i], NULL, mat_mul_thread, pid); // Send pid index as arguments for each thread`
			`}`

			`for (int i = 0; i < num_threads; i++) {`
			`pthread_join(threads[i], NULL);`
			`}`

			`}`