// Computes C = A * B, with all matrices stored row-major:
// A: M rows, K columns
// B: K rows, N columns
// C: M rows, N columns
//                         
//                   N     
//                o-----o  
//                |     |  
//              K | [B] |  
//                |     |  
//                o-----o  
//        K          N     
//    o-------o   o-----o  
//  M |  [A]  | M | [C] |  
//    |       |   |     |  
//    o-------o   o-----o  
//                         

// Tiling configuration for the sgemm kernel.
#define TS_M    64                                  // The tile-size in dimension M
#define TS_N    64                                  // The tile-size in dimension N
#define TS_K    64                                  // The tile-size in dimension K
#define WPT_M   16                                  // The amount of work-per-thread in dimension M
#define WPT_N   8                                   // The amount of work-per-thread in dimension N

// Each work-item owns a WPT_M x WPT_N patch of a TS_M x TS_N tile of C, so the
// tile sizes must be exact multiples of the per-thread work sizes; otherwise
// part of every tile would be left uncomputed. Fail the build instead.
#if (TS_M % WPT_M) != 0 || (TS_N % WPT_N) != 0
#error "TS_M must be a multiple of WPT_M and TS_N must be a multiple of WPT_N"
#endif

// CEIL_DIV(x,y): x / y rounded up (meant for non-negative x and positive y).
// CEIL(x,y):     x rounded up to the next multiple of y.
// Both macros evaluate their arguments more than once; pass side-effect-free
// expressions only.
#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y)     ( CEIL_DIV((x),(y)) * (y) )

// void print_mat(float *m, int R, int C) {
//   for (int i = 0; i < R; ++i) { 
//     for (int j = 0; j < C; ++j) {
//       printf("%+.3f ", m[i * C + j]);
//     }
//     printf("\n");
//   }
// }

/*
 * Tiled single-precision matrix multiply: C = A * B.
 *
 *   A : M x K, row-major (element (r, c) is A[r * K + c])
 *   B : K x N, row-major
 *   C : M x N, row-major (written, never read)
 *
 * Each work-group produces one TS_M x TS_N tile of C. Inside the group, the
 * work-item with local id (row, col) accumulates the WPT_M x WPT_N patch that
 * starts at tile-local position (row * WPT_M, col * WPT_N).
 *
 * Launch requirements implied by the indexing below:
 *   - local size must be (TS_M / WPT_M, TS_N / WPT_N);
 *   - the groups must cover all of C (presumably the host pads the global
 *     size up to whole tiles -- confirm against the host code).
 *
 * M, N and K do not have to be multiples of the tile sizes: out-of-range
 * elements of A and B are loaded as 0 and out-of-range elements of C are not
 * stored.
 */
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K)
{
    // Thread identifiers
    const int row = get_local_id(0);                 // Local row ID (0 .. TS_M/WPT_M - 1)
    const int col = get_local_id(1);                 // Local col ID (0 .. TS_N/WPT_N - 1)
    const int tileRow = TS_M * get_group_id(0);      // First row of C handled by this work-group
    const int tileCol = TS_N * get_group_id(1);      // First col of C handled by this work-group
    const int globalRow = tileRow + row * WPT_M;     // First row of C handled by this work-item
    const int globalCol = tileCol + col * WPT_N;     // First col of C handled by this work-item

    // Flattened work-item index, used only to share out the tile loads.
    const int threadsM = TS_M / WPT_M;
    const int numThreads = threadsM * (TS_N / WPT_N);
    const int tid = col * threadsM + row;

    // Local memory holding the current tiles of A (TS_M x TS_K) and B (TS_K x TS_N)
    __local float Asub[TS_M][TS_K];
    __local float Bsub[TS_K][TS_N];

    // Initialize the accumulation registers
    float acc[WPT_M][WPT_N];
    for (int wm = 0; wm < WPT_M; wm++)
    {
        for (int wn = 0; wn < WPT_N; wn++)
        {
            acc[wm][wn] = 0.0f;
        }
    }

    // Loop over all tiles along K (the last one may be partial)
    const int numTiles = CEIL_DIV(K, TS_K);

    for (int t = 0; t < numTiles; t++)
    {
        // Offset of this tile along the shared K dimension. It has to be
        // derived from TS_K: using TS_M / TS_N here is only correct while all
        // three tile sizes happen to be equal.
        const int kBase = TS_K * t;

        // Cooperative load of the A tile: the work-items stride over the
        // flattened TS_M x TS_K tile, so every element is written exactly once
        // whatever the relation between TS_K and TS_M / TS_N.
        for (int i = tid; i < TS_M * TS_K; i += numThreads)
        {
            const int lr = i / TS_K;                 // Row inside the tile
            const int lc = i % TS_K;                 // Col inside the tile
            const int r = tileRow + lr;              // Row in A
            const int c = kBase + lc;                // Col in A
            Asub[lr][lc] = (r < M && c < K) ? A[r * K + c] : 0.0f;
        }

        // Cooperative load of the B tile, same scheme over TS_K x TS_N.
        for (int i = tid; i < TS_K * TS_N; i += numThreads)
        {
            const int lr = i / TS_N;                 // Row inside the tile
            const int lc = i % TS_N;                 // Col inside the tile
            const int r = kBase + lr;                // Row in B
            const int c = tileCol + lc;              // Col in B
            Bsub[lr][lc] = (r < K && c < N) ? B[r * N + c] : 0.0f;
        }

        // Synchronize to make sure the tile is loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Loop over the values of a single tile
        for (int k = 0; k < TS_K; k++)
        {
            // Cache the values of Bsub in registers
            float bs[WPT_N];

            #pragma unroll
            for (int wn = 0; wn < WPT_N; wn++)
            {
                bs[wn] = Bsub[k][col * WPT_N + wn];
            }

            // Perform the computation
            #pragma unroll
            for (int wm = 0; wm < WPT_M; wm++)
            {
                const float a = Asub[row * WPT_M + wm][k];

                #pragma unroll
                for (int wn = 0; wn < WPT_N; wn++)
                {
                    acc[wm][wn] += a * bs[wn];
                }
            }
        }

        // Synchronize before loading the next tile, so no work-item overwrites
        // local memory that another one is still reading
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Store the final results in C, skipping positions outside the matrix
    for (int wm = 0; wm < WPT_M; wm++)
    {
        for (int wn = 0; wn < WPT_N; wn++)
        {
            const int r = globalRow + wm;
            const int c = globalCol + wn;

            if (r < M && c < N)
            {
                C[r * N + c] = acc[wm][wn];
            }
        }
    }
}

/*
// super super slow sgemm kernel by heehoon
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {
  int i = get_global_id(0); // row index of C
  int j = get_global_id(1); // column index of C
  if (i >= M || j >= N) return; // boundary check

  C[i * N + j] = 0;
  for (int k = 0; k < K; k++) {
    C[i * N + j] += A[i * K + k] * B[k * N + j]; 
  }
}
*/