// super super slow sgemm kernel by heehoon
// Edge length of the square tiles staged in local memory (Asub/Bsub are
// TS x TS); each work-group's output rows/columns start at TS * group id.
#define TS (32)
// Number of output elements accumulated by each work-item (size of its
// private accumulator array and bound of every per-work-item loop).
#define WPT (8)
// Row stride between the consecutive tile rows handled by one work-item.
// NOTE(review): the kernel's tile indexing (row + w * RTS, w < WPT) appears
// to rely on RTS * WPT == TS -- keep these three values in sync.
#define RTS (4)

// Tiled single-precision matrix multiply: C = A * B.
//
// A is read as an M x K row-major matrix (element (i, k) at A[i * K + k]),
// B as a K x N row-major matrix (B[k * N + j]), and C is written as an
// M x N row-major matrix (C[i * N + j]).
//
// Each work-group produces one TS x TS tile of C. The work-item with local
// id (row, col) owns column `col` of that tile and WPT of its rows:
// row, row + RTS, ..., row + (WPT - 1) * RTS. Elements outside the matrices
// are loaded as zero and never written, so M, N and K need not be multiples
// of TS.
//
// NOTE(review): the tile indexing only stays inside Asub/Bsub and fills the
// whole tile when the local work size is (RTS, TS) and RTS * WPT == TS --
// confirm against the host launch code.
__kernel void sgemm(__global const float *A, __global const float *B, __global float *C, int M, int N, int K) {

	const int row = get_local_id(0);
	const int col = get_local_id(1);

	// First output row handled by this work-item, and its output column.
	const int global_row = TS * get_group_id(0) + row;
	const int global_col = TS * get_group_id(1) + col;

	// Tiles of A and B shared by the whole work-group.
	__local float Asub[TS][TS];
	__local float Bsub[TS][TS];

	// One private accumulator per output element owned by this work-item.
	float acc[WPT];
	for (int w = 0; w < WPT; w++) {
		acc[w] = 0.0f;
	}

	// Number of TS-wide steps along K, rounded up so a partial last tile is
	// still processed (its missing elements are zero-filled below).
	const int num_tiles = K / TS + ((K % TS != 0) ? 1 : 0);

	for (int t = 0; t < num_tiles; t++) {
		// Position along K that this work-item loads in tile t; these do
		// not depend on w, so compute them once per tile.
		const int k_row = TS * t + row;
		const int k_col = TS * t + col;

		for (int w = 0; w < WPT; w++) {
			const int tile_row = row + w * RTS;
			const int a_row = global_row + w * RTS;
			const int b_row = k_row + w * RTS;

			// Bounds are checked in int; the flat index is then formed in
			// size_t so it cannot overflow int for large matrices.
			if (k_col < K && a_row < M) {
				Asub[tile_row][col] = A[(size_t)a_row * K + k_col];
			} else {
				Asub[tile_row][col] = 0.0f;
			}

			if (b_row < K && global_col < N) {
				Bsub[tile_row][col] = B[(size_t)b_row * N + global_col];
			} else {
				Bsub[tile_row][col] = 0.0f;
			}
		}

		// Both tiles must be completely written before anyone reads them.
		barrier(CLK_LOCAL_MEM_FENCE);

		for (int k = 0; k < TS; k++) {
			// Every owned row multiplies against the same B element.
			const float b_val = Bsub[k][col];
			for (int w = 0; w < WPT; w++) {
				acc[w] += Asub[row + w * RTS][k] * b_val;
			}
		}

		// Nobody may overwrite the tiles until all work-items are done
		// reading them.
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	// Write back the owned elements. Rows grow with w, so once one row is
	// out of range every later one is too and the work-item can stop.
	for (int w = 0; w < WPT; w++) {
		const int c_row = global_row + w * RTS;
		if (c_row >= M || global_col >= N) {
			return;
		}

		C[(size_t)c_row * N + global_col] = acc[w];
	}
}