// super super slow sgemm kernel by heehoon /*__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) { int i = get_global_id(0); // row index of C int j = get_global_id(1); // column index of C if (i >= M || j >= N) return; // boundary check C[i * N + j] = 0; for (int k = 0; k < K; k++) { C[i * N + j] += A[i * K + k] * B[k * N + j]; } }*/ // 2nd method - Tiled /* #define TS 32 __kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) { // int i = get_global_id(0); // row index of C // int j = get_global_id(1); // column index of C //if (i >= M || j >= N) return; // boundary check const int row = get_local_id(0); // local row ID (0~31) const int col = get_local_id(1); // local col ID (0~31) const int global_row = TS * get_group_id(0) + row; // row ID (0~M) const int global_col = TS * get_group_id(1) + col; // row ID (0~N) if (global_row >= M || global_col >= N) return; // boundary check __local float Asub[TS][TS]; __local float Bsub[TS][TS]; float acc_val = 0.0f; const int num_tiles = K / TS; for(int t=0; t= M || global_col >= N) return; // boundary check __local float Asub[TS][TS]; __local float Bsub[TS][TS]; float acc_val[WPT] = {0.0f,}; const int num_tiles = K / TS; for(int t=0; t= M) || (global_col*WIDTH >= N)) return; //const int incomplete_ws_row = (M - global_row) < (M % TS); //const int incomplete_ws_col = (N - global_col*WIDTH) < (N % TS); //const int remain_ws_col = min(N - global_col*WIDTH, TS); const int remain_th_N = min(N - global_col*WIDTH, WIDTH); const int incomplete_ws_row = (M - global_row) <= (M % TS); const int incomplete_ws_col = (N - global_col*WIDTH) <= (N % TS); __local float8 Asub[TS][TS/WIDTH]; __local float8 Bsub[TS][TS/WIDTH]; float8 acc_val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; const int num_tiles = K/TS; //C[global_row*N + global_col] = 100.0f*incomplete_ws_row + 10.0f*incomplete_ws_col + 1.0f; //C[global_row*N + global_col] = remain_th_N*1.0f; //C[global_row*N + global_col] = num_tiles*-1.0f; //return; if(incomplete_ws_row == 0 && incomplete_ws_col == 0) { for(int t=0; t= N) return; // Local memory to fit a tile of TS*TS elements of A and B __local float Asub[TS][TS]; __local float Bsub[TS][TS]; // Initialise the accumulation registers float acc[WPT]; for (int w=0; w