// super super slow sgemm kernel by heehoon
//
// NOTE(review): this file had been collapsed onto a single line (so the
// leading "//" commented out everything) and every "<...>" span had been
// stripped. The line structure is restored below; spans of sgemm that had
// to be rebuilt from the surviving identifiers are marked NOTE(review) and
// should be checked against the original source.

#define TS 32               // tile edge: local tiles are TS x TS floats
#define WPTF 8              // elements of C accumulated per work-item in sgemm
#define RTSF (TS/WPTF)      // row stride between a work-item's elements in sgemm
#define WPTL 16             // size of the per-work-item accumulator in sgemma
#define RTSL (TS/WPTL)
#define PADDINGX 16         // not referenced in the visible part of this file
#define PADDINGY 16         // not referenced in the visible part of this file

/*
 * Tiled single-precision matrix multiply, C = A * B.
 *
 * Indexing used by the kernel (row-major):
 *   A[i*K + k]  -> A is M x K
 *   B[k*N + j]  -> B is K x N
 *   C[i*N + j]  -> C is M x N
 *
 * Each work-item accumulates WPTF elements of C, located in column
 * global_col at rows global_row + w*RTSF for w = 0..WPTF-1. Tile elements
 * that fall outside the matrices are loaded as 0.0f and out-of-range
 * elements of C are not written, so M, N and K need not be multiples of TS.
 *
 * NOTE(review): the row + w*RTSF indexing only fills a whole TS x TS tile
 * if the work-group is launched with local size (RTSF, TS) -- confirm
 * against the host code.
 */
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K)
{
    const int row = get_local_id(0); // local row inside the tile
    const int col = get_local_id(1); // local column inside the tile
    const int global_row = TS*get_group_id(0)+row; // first row of C handled here
    const int global_col = TS*get_group_id(1)+col; // column of C handled here

    __local float Asub[TS][TS];
    __local float Bsub[TS][TS];

    float acc[WPTF];
    for (int w = 0; w < WPTF; w++) {
        acc[w] = 0.0f;
    }

    // Number of TS-wide tiles needed to cover K, rounding up.
    // NOTE(review): the variable name was lost; only the expression survived.
    const int numTiles = K%TS>0 ? K/TS+1 : K/TS;

    for (int t = 0; t < numTiles; t++) {
        // NOTE(review): the definitions of tile_row / tile_col were stripped;
        // reconstructed from how they are used in the loads below.
        const int tile_row = TS*t+row;
        const int tile_col = TS*t+col;

        // Load one tile of A and one tile of B, zero-filling out-of-range
        // elements so the inner product below needs no bounds checks.
        for (int w = 0; w < WPTF; w++) {
            if (global_row+w*RTSF >= M || tile_col >= K) {
                Asub[row+w*RTSF][col] = 0.0f;
            } else {
                Asub[row+w*RTSF][col] = A[(global_row+w*RTSF)*K+tile_col];
            }

            if (tile_row+w*RTSF >= K || global_col >= N) {
                Bsub[row+w*RTSF][col] = 0.0f;
            } else {
                Bsub[row+w*RTSF][col] = B[(tile_row+w*RTSF)*N+global_col];
            }
        }

        // Both tiles must be fully written before any work-item reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        for (int k = 0; k < TS; k++) {
            // NOTE(review): the body of this loop was stripped; reconstructed
            // as the tile inner product matching the load/store indexing.
            for (int w = 0; w < WPTF; w++) {
                acc[w] += Asub[row+w*RTSF][k] * Bsub[k][col];
            }
        }

        // NOTE(review): reconstructed. Keeps the next iteration's loads from
        // overwriting tiles that other work-items are still reading.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write back only the elements that lie inside C.
    for (int w = 0; w < WPTF; w++) {
        if (global_row+w*RTSF < M && global_col < N) {
            C[(global_row+w*RTSF)*N+global_col] = acc[w];
        }
    }
}

// NOTE(review): sgemma is truncated in this view -- the text stops in the
// middle of its first loop header. The visible prefix is kept as-is below;
// nothing past it has been reconstructed.
__kernel void sgemma(__global float *A, __global float *B, __global float *C, int M, int N, int K)
{
    const int row = get_local_id(0); // local row inside the tile
    const int col = get_local_id(1); // local column inside the tile
    const int global_row = TS*get_group_id(0)+row;
    const int global_col = TS*get_group_id(1)+col;

    __local float Asub[TS][TS];
    __local float Bsub[TS][TS];

    float acc[WPTL];
    for(int w=0;w