// super super slow sgemm kernel by heehoon
//
// Naive single-precision matrix multiply: C = A * B.
// Each work-item computes exactly one element of C as a K-term dot product,
// reading every operand directly from __global memory.
//
// A : M x K matrix, row-major (element (i, k) at A[i * K + k]); read-only
// B : K x N matrix, row-major (element (k, j) at B[k * N + j]); read-only
// C : M x N matrix, row-major (element (i, j) at C[i * N + j]); written
// M : number of rows of A and C
// N : number of columns of B and C
// K : number of columns of A / rows of B; if K <= 0 the element is set to 0
//
// Global id dimension 0 selects the row of C, dimension 1 the column.
// Work-items whose ids fall outside the M x N output do nothing, so the
// global work size may be rounded up past the matrix dimensions.
__kernel void sgemm(__global const float *A,
                    __global const float *B,
                    __global float *C,
                    int M, int N, int K)
{
    const size_t row = get_global_id(0);  // row index of C
    const size_t col = get_global_id(1);  // column index of C

    // Boundary check. Non-positive dimensions are rejected explicitly so the
    // casts to size_t below can never turn a negative int into a huge bound.
    if (M <= 0 || N <= 0 || row >= (size_t)M || col >= (size_t)N) {
        return;
    }

    // Flattened offsets are computed in size_t: with int arithmetic,
    // i * K + k overflows (undefined behaviour) once a matrix holds more
    // than INT_MAX elements.
    const size_t n = (size_t)N;
    const size_t a_row = row * (size_t)K;  // only used when K > 0 (loop runs)

    float acc = 0.0f;
    for (int k = 0; k < K; k++) {
        acc += A[a_row + (size_t)k] * B[(size_t)k * n + col];
    }

    C[row * n + col] = acc;
}