// super super slow sgemm kernel by heehoon
//
// Tiled single-precision matrix multiply: C = A * B, where C is M x N and is
// stored row-major (C[row * N + col]).
//
// NOTE(review): this file had been collapsed onto a single comment line and
// the text between the accumulator loop header and the final bounds check was
// lost. Everything marked "reconstructed" below was rebuilt from the surviving
// declarations and the surviving write-back statement; confirm it against the
// original kernel before relying on it.
//
// NOTE(review): assumes A is M x K and B is K x N, both row-major, and that the
// kernel is launched with a local work size of (RTS, TS) so that each work-item
// produces WPT rows of C spaced RTS apart -- confirm against the host code.

#define TS 32            // tile edge length (elements)
#define WPT 8            // outputs of C computed per work-item
#define RTS (TS/WPT)     // tile rows divided by work per work-item

__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {
  const int i = get_local_id(0); // row index of C
  const int j = get_local_id(1); // column index of C
  const int gi = TS * get_group_id(0) + i;
  const int gj = TS * get_group_id(1) + j;

  __local float Asub[TS][TS];
  __local float Bsub[TS][TS];

  // One running sum per output row handled by this work-item.
  float intermediate_val[WPT];
  for (int w = 0; w < WPT; w++) {
    intermediate_val[w] = 0.0f;
  }

  // ---- reconstructed: walk the K dimension one TS-wide tile at a time ----
  const int num_tiles = (K + TS - 1) / TS;
  for (int t = 0; t < num_tiles; t++) {
    // Each work-item stages WPT elements of each tile; out-of-range elements
    // are zero-filled so partial edge tiles contribute nothing to the sums.
    for (int w = 0; w < WPT; w++) {
      const int tile_row = i + w * RTS;
      const int a_row = gi + w * RTS;
      const int a_col = TS * t + j;
      const int b_row = TS * t + tile_row;

      Asub[tile_row][j] = (a_row < M && a_col < K) ? A[a_row * K + a_col] : 0.0f;
      Bsub[tile_row][j] = (b_row < K && gj < N) ? B[b_row * N + gj] : 0.0f;
    }

    // Both tiles must be fully written before any work-item reads them.
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 0; k < TS; k++) {
      const float b_val = Bsub[k][j];
      for (int w = 0; w < WPT; w++) {
        intermediate_val[w] += Asub[i + w * RTS][k] * b_val;
      }
    }

    // Do not let the next iteration overwrite tiles that are still being read.
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // ---- end of reconstructed section ----

  // Write back only the elements that fall inside C.
  for (int w = 0; w < WPT; w++) {
    if (M > (gi + w * RTS) && N > gj) {
      C[(gi + w * RTS) * N + gj] = intermediate_val[w];
    }
  }
}