chundoong-lab-ta/SamsungDS22/submissions/HW2/jicheol.kim/mat_mul.cpp

#include "mat_mul.h"

#include <cstdlib>
#include <cstdio>
#include <pthread.h>
#include <algorithm>
#include <stdlib.h>
#include <unistd.h>
using namespace std;

// Operand and result matrices of the multiplication currently in flight.
// They are addressed as flat arrays: A[i*K + k], B[k*N + j], C[i*N + j].
static float *A;
static float *B;
static float *C;
// Dimensions used by the index expressions above and by the loop bounds:
// i runs up to M, j up to N, k up to K.
static int M;
static int N;
static int K;
// Thread count passed in by the caller of mat_mul().
static int num_threads;
 
// Work description handed to one worker thread: the rows [start, end) of the
// output matrix that the worker is responsible for.
typedef struct thread_info {
    int start;  // first row index processed by the worker
    int end;    // one past the last row index processed by the worker
} thread_info;

static void* mat_mul_thread(void *data) {
  
  thread_info *in = (thread_info *)data;
  // TODO: parallelize & optimize matrix multiplication
  int start_row = in->start;
  int end_row = in->end;
  int block = 32;
  float temp = 0;
  float check = N/8;

  if(check !=0){
      for(int kk = 0; kk < K; kk+= block){  
          for(int i = start_row; i < min(end_row,M); ++i){
              for(int k= kk; k< min(kk + block, K); ++k){
	          temp = A[i*K+k];
	          for(int j=0; j< N; ++j){
	              C[i*N+j] += temp * B[k*N+j];
	         }
	     }
          }
      }
  }else{
      for(int kk = 0 ; kk < K; kk += block){
        //  for(int jj = 0; jj < N; jj += block){
              for(int i = start_row; i < min(end_row,M); i++){
                  for(int k = kk; k < min(kk + block, K); k++){
                      temp = A[i*K+k]; 
	              for(int j= 0; j< N; j+=8){
	                  C[i*N+j+0] += temp * B[k*N+j+0];
		          C[i*N+j+1] += temp * B[k*N+j+1];
		          C[i*N+j+2] += temp * B[k*N+j+2];
		          C[i*N+j+3] += temp * B[k*N+j+3];
		          C[i*N+j+4] += temp * B[k*N+j+4];
		          C[i*N+j+5] += temp * B[k*N+j+5];
		          C[i*N+j+6] += temp * B[k*N+j+6];
		          C[i*N+j+7] += temp * B[k*N+j+7];
		        /*  C[i*N+j+8] += temp * B[k*N+j+8];
		          C[i*N+j+9] += temp * B[k*N+j+9];
		          C[i*N+j+10] += temp * B[k*N+j+10];
		          C[i*N+j+11] += temp * B[k*N+j+11];
		          C[i*N+j+12] += temp * B[k*N+j+12];
		          C[i*N+j+13] += temp * B[k*N+j+13];
		          C[i*N+j+14] += temp * B[k*N+j+14];
		          C[i*N+j+15] += temp * B[k*N+j+15];
			  C[i*N+j+16] += temp * B[k*N+j+16];
			  C[i*N+j+17] += temp * B[k*N+j+17];
			  C[i*N+j+18] += temp * B[k*N+j+18];
			  C[i*N+j+19] += temp * B[k*N+j+19];
			  C[i*N+j+20] += temp * B[k*N+j+20];
			  C[i*N+j+21] += temp * B[k*N+j+21];
			  C[i*N+j+22] += temp * B[k*N+j+22];
		          C[i*N+j+23] += temp * B[k*N+j+23];
			  C[i*N+j+24] += temp * B[k*N+j+24];
			  C[i*N+j+25] += temp * B[k*N+j+25];
			  C[i*N+j+26] += temp * B[k*N+j+26];
			  C[i*N+j+27] += temp * B[k*N+j+27];
			  C[i*N+j+28] += temp * B[k*N+j+28];
			  C[i*N+j+29] += temp * B[k*N+j+29];
			  C[i*N+j+30] += temp * B[k*N+j+30];
			  C[i*N+j+31] += temp * B[k*N+j+31];*/
		      }
		  }
	
	 //    }
          }
      }
  }      
  return NULL;
}

void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {
  A = _A, B = _B, C = _C;
  M = _M, N = _N, K = _K;
  num_threads = _num_threads;
 
  int n_split = 0;
  int n_work = 0;
 
  float check = M/num_threads;
 
  if(check !=0)
     n_work = M < num_threads ? 1 : M/num_threads+1;
  else
     n_work = M < num_threads ? 1 : M/num_threads;

  n_split = M < num_threads ? M : num_threads;
  
  pthread_t threads[n_split];
  thread_info t_pool[n_split];
  //threads = (pthread_t *) malloc(sizeof(pthread_t)* n_split);
 
  for(int i = 0 ; i < n_split; i++){
      thread_info tinfo;
      //struct thread_info *tinfo = (struct thread_info *) malloc(sizeof(struct thread_info));
      tinfo.start = i*n_work;
      // tinfo->end = i == n_split - 1? M:(i+1)*n_split;   //tinfo.start + n_work;
      tinfo.end = tinfo.start + n_work;
      t_pool[i] = tinfo;      
      pthread_create(&threads[i], NULL, mat_mul_thread,(void*)&t_pool[i]);
 
  }
  
  for(int i = 0; i< n_split; i++)
  {
      pthread_join(threads[i], NULL);
  }

}
. 2022-09-29 18:01:45 +09:00			`#include "mat_mul.h"`

			`#include <cstdlib>`
			`#include <cstdio>`
			`#include <pthread.h>`
			`#include <algorithm>`
			`#include <stdlib.h>`
			`#include <unistd.h>`
			`using namespace std;`

			`static float *A, *B, *C;`
			`static int M, N, K;`
			`static int num_threads;`

			`typedef struct thread_info{`
			`int start, end;`
			`}thread_info;`

			`static void* mat_mul_thread(void *data) {`

			`thread_info *in = (thread_info *)data;`
			`// TODO: parallelize & optimize matrix multiplication`
			`int start_row = in->start;`
			`int end_row = in->end;`
			`int block = 32;`
			`float temp = 0;`
			`float check = N/8;`

			`if(check !=0){`
			`for(int kk = 0; kk < K; kk+= block){`
			`for(int i = start_row; i < min(end_row,M); ++i){`
			`for(int k= kk; k< min(kk + block, K); ++k){`
			`temp = A[i*K+k];`
			`for(int j=0; j< N; ++j){`
			`C[i*N+j] += temp * B[k*N+j];`
			`}`
			`}`
			`}`
			`}`
			`}else{`
			`for(int kk = 0 ; kk < K; kk += block){`
			`// for(int jj = 0; jj < N; jj += block){`
			`for(int i = start_row; i < min(end_row,M); i++){`
			`for(int k = kk; k < min(kk + block, K); k++){`
			`temp = A[i*K+k];`
			`for(int j= 0; j< N; j+=8){`
			`C[i*N+j+0] += temp * B[k*N+j+0];`
			`C[i*N+j+1] += temp * B[k*N+j+1];`
			`C[i*N+j+2] += temp * B[k*N+j+2];`
			`C[i*N+j+3] += temp * B[k*N+j+3];`
			`C[i*N+j+4] += temp * B[k*N+j+4];`
			`C[i*N+j+5] += temp * B[k*N+j+5];`
			`C[i*N+j+6] += temp * B[k*N+j+6];`
			`C[i*N+j+7] += temp * B[k*N+j+7];`
			`/* C[i*N+j+8] += temp * B[k*N+j+8];`
			`C[i*N+j+9] += temp * B[k*N+j+9];`
			`C[i*N+j+10] += temp * B[k*N+j+10];`
			`C[i*N+j+11] += temp * B[k*N+j+11];`
			`C[i*N+j+12] += temp * B[k*N+j+12];`
			`C[i*N+j+13] += temp * B[k*N+j+13];`
			`C[i*N+j+14] += temp * B[k*N+j+14];`
			`C[i*N+j+15] += temp * B[k*N+j+15];`
			`C[i*N+j+16] += temp * B[k*N+j+16];`
			`C[i*N+j+17] += temp * B[k*N+j+17];`
			`C[i*N+j+18] += temp * B[k*N+j+18];`
			`C[i*N+j+19] += temp * B[k*N+j+19];`
			`C[i*N+j+20] += temp * B[k*N+j+20];`
			`C[i*N+j+21] += temp * B[k*N+j+21];`
			`C[i*N+j+22] += temp * B[k*N+j+22];`
			`C[i*N+j+23] += temp * B[k*N+j+23];`
			`C[i*N+j+24] += temp * B[k*N+j+24];`
			`C[i*N+j+25] += temp * B[k*N+j+25];`
			`C[i*N+j+26] += temp * B[k*N+j+26];`
			`C[i*N+j+27] += temp * B[k*N+j+27];`
			`C[i*N+j+28] += temp * B[k*N+j+28];`
			`C[i*N+j+29] += temp * B[k*N+j+29];`
			`C[i*N+j+30] += temp * B[k*N+j+30];`
			`C[i*N+j+31] += temp * B[k*N+j+31];*/`
			`}`
			`}`

			`// }`
			`}`
			`}`
			`}`
			`return NULL;`
			`}`

			`void mat_mul(float *_A, float *_B, float *_C, int _M, int _N, int _K, int _num_threads) {`
			`A = _A, B = _B, C = _C;`
			`M = _M, N = _N, K = _K;`
			`num_threads = _num_threads;`

			`int n_split = 0;`
			`int n_work = 0;`

			`float check = M/num_threads;`

			`if(check !=0)`
			`n_work = M < num_threads ? 1 : M/num_threads+1;`
			`else`
			`n_work = M < num_threads ? 1 : M/num_threads;`

			`n_split = M < num_threads ? M : num_threads;`

			`pthread_t threads[n_split];`
			`thread_info t_pool[n_split];`
			`//threads = (pthread_t *) malloc(sizeof(pthread_t)* n_split);`

			`for(int i = 0 ; i < n_split; i++){`
			`thread_info tinfo;`
			`//struct thread_info *tinfo = (struct thread_info *) malloc(sizeof(struct thread_info));`
			`tinfo.start = i*n_work;`
			`// tinfo->end = i == n_split - 1? M:(i+1)*n_split; //tinfo.start + n_work;`
			`tinfo.end = tinfo.start + n_work;`
			`t_pool[i] = tinfo;`
			`pthread_create(&threads[i], NULL, mat_mul_thread,(void*)&t_pool[i]);`

			`}`

			`for(int i = 0; i< n_split; i++)`
			`{`
			`pthread_join(threads[i], NULL);`
			`}`

			`}`