chundoong-lab-ta/SamsungDS22/submissions/HW5/junha96.jeon/kernel.cl

// super super slow sgemm kernel by heehoon

// Edge length of the square local-memory tiles (Asub and Bsub are TS x TS).
#define TS 32
// Number of output elements each work-item accumulates.
#define WPT 8
// Row stride between the WPT outputs handled by one work-item.
// Parenthesized so the macro stays a single value inside larger expressions
// (e.g. `x / RTS` must mean x / (TS / WPT), not (x / TS) / WPT).
#define RTS (TS / WPT)

// Tiled single-precision matrix multiply, C = A * B.
//
// Indexing treats A as M x K, B as K x N and C as M x N, all row-major.
// Each work-item accumulates WPT elements of C that share one column and
// sit RTS rows apart. Tiles of A and B are staged through local memory and
// zero-padded wherever they run past the matrix edges.
//
// NOTE(review): the indexing implies a work-group shape of (TS/WPT, TS);
// confirm against the host-side launch configuration.
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {
  const int lrow = get_local_id(0); // row of this work-item inside the tile
  const int lcol = get_local_id(1); // column of this work-item inside the tile

  // Position in C of the first element owned by this work-item.
  const int c_row = TS * get_group_id(0) + lrow;
  const int c_col = TS * get_group_id(1) + lcol;

  __local float Asub[TS][TS];
  __local float Bsub[TS][TS];

  // One running sum per output element owned by this work-item.
  float acc[WPT];
  for (int w = 0; w < WPT; ++w) {
    acc[w] = 0.0f;
  }

  // Number of TS-wide steps needed to cover K, counting a partial last tile.
  const int num_tiles = K / TS + ((K % TS != 0) ? 1 : 0);

  for (int tile = 0; tile < num_tiles; ++tile) {
    // These only depend on the tile index, so compute them once per tile.
    const int k_base = TS * tile;
    const int a_col = k_base + lcol;

    // Each work-item fills WPT entries of both local tiles.
    for (int w = 0; w < WPT; ++w) {
      const int off = w * RTS;
      const int srow = lrow + off;      // row inside the local tiles
      const int a_row = c_row + off;    // row of A to read
      const int b_row = k_base + srow;  // row of B to read

      // Out-of-range elements are replaced by zero so they do not
      // contribute to the dot products below.
      Asub[srow][lcol] = (a_row < M && a_col < K) ? A[a_row * K + a_col] : 0.0f;
      Bsub[srow][lcol] = (b_row < K && c_col < N) ? B[b_row * N + c_col] : 0.0f;
    }
    // Both tiles must be completely written before any work-item reads them.
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 0; k < TS; ++k) {
      // The B operand is shared by all WPT accumulators of this work-item.
      const float b_val = Bsub[k][lcol];
      for (int w = 0; w < WPT; ++w) {
        acc[w] += Asub[lrow + w * RTS][k] * b_val;
      }
    }

    // Keep the tiles intact until every work-item has finished reading them.
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Write back only the results that fall inside C.
  for (int w = 0; w < WPT; ++w) {
    const int out_row = c_row + w * RTS;
    if (out_row < M && c_col < N) {
      C[out_row * N + c_col] = acc[w];
    }
  }
}
. 2022-09-29 18:01:45 +09:00			`// super super slow sgemm kernel by heehoon`

			`#define TS 32`
			`#define WPT 8`
			`#define RTS TS/WPT`

			`__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {`
			`const int row = get_local_id(0); // row index of C`
			`const int col = get_local_id(1); // column index of C`

			`const int global_row = TS * get_group_id(0) + row;`
			`const int global_col = TS * get_group_id(1) + col;`

			`__local float Asub[TS][TS];`
			`__local float Bsub[TS][TS];`


			`float intermediate_val[WPT];`
			`for(int w=0; w<WPT; w++){`
			`intermediate_val[w]=0.0f;`
			`}`

			`int num_tiles;`
			`if(K%TS == 0)`
			`num_tiles = K/TS;`
			`else`
			`num_tiles = K/TS + 1;`

			`for(int t=0; t<num_tiles; t++){`
			`for(int w=0; w<WPT; w++) {`
			`const int t_row = TS*t + row;`
			`const int t_col = TS*t + col;`
			`if((global_row + w*RTS)<M && t_col<K)`
			`Asub[row + w*RTS][col] = A[(global_row + w*RTS)*K+ t_col];`
			`else`
			`Asub[row + w*RTS][col] = 0.0f;`
			`if((t_row + w*RTS)<K && global_col<N)`
			`Bsub[row + w*RTS][col] = B[(t_row + w*RTS)*N + global_col];`
			`else`
			`Bsub[row + w*RTS][col] = 0.0f;`
			`}`
			`barrier(CLK_LOCAL_MEM_FENCE);`

			`for(int k=0; k<TS; k++){`
			`for(int w=0; w< WPT; w++){`
			`intermediate_val[w] += Asub[row + w*RTS][k] * Bsub[k][col];`
			`}`
			`}`


			`barrier(CLK_LOCAL_MEM_FENCE);`
			`}`

			`for(int w=0; w<WPT; w++) {`
			`if((global_row + w*RTS)<M && global_col<N)`
			`C[(global_row + w*RTS)*N + global_col] = intermediate_val[w];`
			`}`

			`}`