chundoong-lab-ta/SamsungDS22/submissions/HW5/jb114.seo/kernel.cl

// A: M rows, K columns
// B: K rows, N columns
// C: M rows, N columns
//                         
//                   N     
//                o-----o  
//                |     |  
//              K | [B] |  
//                |     |  
//                o-----o  
//        K          N     
//    o-------o   o-----o  
//  M |  [A]  | M | [C] |  
//    |       |   |     |  
//    o-------o   o-----o  
//                         

// Extents, in elements, of the tile one work-group processes per step.
#define TS_M 64 // tile height: rows of A / C
#define TS_N 64 // tile width: columns of B / C
#define TS_K 64 // tile depth along the shared K dimension

// Size of the block of C that a single work-item accumulates.
#define WPT_M 16 // rows per work-item
#define WPT_N 8  // columns per work-item

// Quotient num / den rounded up (for non-negative num and positive den).
// Both arguments are expanded more than once, so avoid side effects.
#define CEIL_DIV(num, den) (((num) + (den) - 1) / (den))

// num rounded up to the next multiple of den.
#define CEIL(num, den) (CEIL_DIV((num), (den)) * (den))

// void print_mat(float *m, int R, int C) {
//   for (int i = 0; i < R; ++i) { 
//     for (int j = 0; j < C; ++j) {
//       printf("%+.3f ", m[i * C + j]);
//     }
//     printf("\n");
//   }
// }

// sgemm: C = A * B for row-major single-precision matrices.
//
//   A: M x K, B: K x N, C: M x N (element (r, c) of C is C[r * N + c]).
//
// Each work-group produces one TS_M x TS_N tile of C; each work-item keeps a
// WPT_M x WPT_N block of that tile in private accumulators. The K dimension is
// walked in steps of TS_K, staging a TS_M x TS_K tile of A and a TS_K x TS_N
// tile of B in local memory per step.
//
// Elements outside the matrix bounds are read as 0 and never written, so M, N
// and K do not have to be multiples of the tile sizes.
//
// NOTE(review): the output indexing assumes the host launches the kernel with a
// local work size of (TS_M / WPT_M, TS_N / WPT_N) and one work-group per tile
// of C -- confirm against the host code.
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K)
{
    // Position of this work-item inside its work-group
    const int row = get_local_id(0);
    const int col = get_local_id(1);

    // Top-left corner of the tile of C owned by this work-group
    const int tileRow = TS_M * get_group_id(0);
    const int tileCol = TS_N * get_group_id(1);

    // Top-left corner of the WPT_M x WPT_N block owned by this work-item
    const int globalRow = tileRow + row * WPT_M;
    const int globalCol = tileCol + col * WPT_N;

    // Flattened work-item index (dimension 0 fastest, the same ordering as
    // get_local_linear_id) and work-group size. They are used to spread the
    // tile loads over the whole work-group independently of WPT_M / WPT_N.
    const int lid = col * (int)get_local_size(0) + row;
    const int lsz = (int)(get_local_size(0) * get_local_size(1));

    // Local memory holding the current tiles of A and B
    __local float Asub[TS_M][TS_K];
    __local float Bsub[TS_K][TS_N];

    // Private accumulators for this work-item's block of C
    float acc[WPT_M][WPT_N];
    for (int wm = 0; wm < WPT_M; wm++)
    {
        for (int wn = 0; wn < WPT_N; wn++)
        {
            acc[wm][wn] = 0.0f;
        }
    }

    // Number of TS_K-deep steps needed to cover K (the last one may be partial)
    const int numTiles = CEIL_DIV(K, TS_K);

    for (int t = 0; t < numTiles; t++)
    {
        // Offset of this step along K. It must advance by TS_K, the depth of
        // Asub / Bsub, not by TS_M or TS_N.
        const int kBase = TS_K * t;

        // Cooperatively load the TS_M x TS_K tile of A. The flattened index
        // covers every element exactly once for any tile shape.
        for (int i = lid; i < TS_M * TS_K; i += lsz)
        {
            const int lr = i / TS_K;
            const int lc = i % TS_K;
            const int r = tileRow + lr;
            const int c = kBase + lc;
            Asub[lr][lc] = (r >= M || c >= K) ? 0.0f : A[r * K + c];
        }

        // Cooperatively load the TS_K x TS_N tile of B
        for (int i = lid; i < TS_K * TS_N; i += lsz)
        {
            const int lr = i / TS_N;
            const int lc = i % TS_N;
            const int r = kBase + lr;
            const int c = tileCol + lc;
            Bsub[lr][lc] = (r >= K || c >= N) ? 0.0f : B[r * N + c];
        }

        // Both tiles must be complete before any work-item reads them
        barrier(CLK_LOCAL_MEM_FENCE);

        // Accumulate the contribution of this step
        for (int k = 0; k < TS_K; k++)
        {
            // Copy this work-item's slice of row k of Bsub into a private array
            float bs[WPT_N];

            #pragma unroll
            for (int wn = 0; wn < WPT_N; wn++) {
                bs[wn] = Bsub[k][col * WPT_N + wn];
            }

            #pragma unroll
            for (int wm = 0; wm < WPT_M; wm++)
            {
                float a = Asub[row * WPT_M + wm][k];

                #pragma unroll
                for (int wn = 0; wn < WPT_N; wn++)
                {
                    acc[wm][wn] += a * bs[wn];
                }
            }
        }

        // Nobody may overwrite the tiles while another work-item still reads them
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write the accumulated block back, skipping elements outside C
    for (int wm = 0; wm < WPT_M; wm++)
    {
        for (int wn = 0; wn < WPT_N; wn++)
        {
            int r = globalRow + wm;
            int c = globalCol + wn;

            if (r < M && c < N)
            {
                C[r * N + c] = acc[wm][wn];
            }
        }
    }
}

/*
// super super slow sgemm kernel by heehoon
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {
  int i = get_global_id(0); // row index of C
  int j = get_global_id(1); // column index of C
  if (i >= M || j >= N) return; // boundary check

  C[i * N + j] = 0;
  for (int k = 0; k < K; k++) {
    C[i * N + j] += A[i * K + k] * B[k * N + j]; 
  }
}
*/
. 2022-09-29 18:01:45 +09:00			`// A: M rows, K columns`
			`// B: K rows, N columns`
			`// C: M rows, N columns`
			`//`
			`//                   N`
			`//                o-----o`
			`//                |     |`
			`//              K | [B] |`
			`//                |     |`
			`//                o-----o`
			`//        K          N`
			`//    o-------o   o-----o`
			`//  M |  [A]  | M | [C] |`
			`//    |       |   |     |`
			`//    o-------o   o-----o`
			`//`

			`#define TS_M 64 // The tile-size in dimension M`
			`#define TS_N 64 // The tile-size in dimension N`
			`#define TS_K 64 // The tile-size in dimension K`
			`#define WPT_M 16 // The amount of work-per-thread in dimension M`
			`#define WPT_N 8 // The amount of work-per-thread in dimension N`

			`#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )`
			`#define CEIL(x,y) ( CEIL_DIV((x),(y)) * (y) )`

			`// void print_mat(float *m, int R, int C) {`
			`// for (int i = 0; i < R; ++i) {`
			`// for (int j = 0; j < C; ++j) {`
			`// printf("%+.3f ", m[i * C + j]);`
			`// }`
			`// printf("\n");`
			`// }`
			`// }`

			`__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K)`
			`{`
			`// Thread identifiers`
			`const int row = get_local_id(0); // Local row ID (max: TS_M/WPT_M)`
			`const int col = get_local_id(1); // Local col ID (max: TS_N/WPT_N)`
			`const int globalRow = TS_M * get_group_id(0) + row * WPT_M; // Row ID of C (0..M)`
			`const int globalCol = TS_N * get_group_id(1) + col * WPT_N; // Col ID of C (0..N)`

			`//printf("[R%03d, C%03d] GR=%d GC=%d\n", row, col, globalRow, globalCol);`

			`// Local memory to fit a tile of TS*TS elements of A and B`
			`__local float Asub[TS_M][TS_K];`
			`__local float Bsub[TS_K][TS_N];`

			`// Initialize the accumulation registers`
			`float acc[WPT_M][WPT_N];`
			`for (int wm = 0; wm < WPT_M; wm++)`
			`{`
			`for (int wn = 0; wn < WPT_N; wn++)`
			`{`
			`acc[wm][wn] = 0.0f;`
			`}`
			`}`

			`// Loop over all tiles`
			`const int numTiles = CEIL_DIV(K, TS_K);`

			`// if (row ==0 && col == 0)`
			`// {`
			`// printf("Number of tiles: %d\n", numTiles);`
			`// }`


			`for (int t = 0; t < numTiles; t++)`
			`{`
			`const int rowInB = TS_M * t + row * WPT_M;`
			`const int colInA = TS_N * t + col * WPT_N;`

			`// Load one tile of A and B into local memory`
			`for (int wm = 0; wm < WPT_M; wm++)`
			`{`
			`for (int wn = 0; wn < WPT_N; wn++)`
			`{`
			`int r, c;`

			`r = globalRow + wm;`
			`c = colInA + wn;`
			`Asub[row * WPT_M + wm][col * WPT_N + wn] = (r >= M || c >= K) ? 0.0f : A[r * K + c];`

			`r = rowInB + wm;`
			`c = globalCol + wn;`
			`Bsub[row * WPT_M + wm][col * WPT_N + wn] = (r >= K || c >= N) ? 0.0f : B[r * N + c];`
			`}`
			`}`

			`// Synchronize to make sure the tile is loaded`
			`barrier(CLK_LOCAL_MEM_FENCE);`

			`// if (row ==0 && col == 0)`
			`// {`
			`// printf("MATRIX Asub:\n");`
			`// print_mat((float*)Asub, TS_M, TS_K);`

			`// printf("MATRIX Bsub:\n");`
			`// print_mat((float*)Bsub, TS_K, TS_N);`
			`// }`

			`// Loop over the values of a single tile`
			`for (int k = 0; k < TS_K; k++)`
			`{`
			`// Cache the values of Bsub in registers`
			`float bs[WPT_N];`

			`#pragma unroll`
			`for (int wn = 0; wn < WPT_N; wn++) {`
			`bs[wn] = Bsub[k][col * WPT_N + wn];`
			`}`

			`// Perform the computation`
			`#pragma unroll`
			`for (int wm = 0; wm < WPT_M; wm++)`
			`{`
			`float a = Asub[row * WPT_M + wm][k];`

			`#pragma unroll`
			`for (int wn = 0; wn < WPT_N; wn++)`
			`{`
			`acc[wm][wn] += a * bs[wn];`
			`}`
			`}`
			`}`

			`// Synchronize before loading the next tile`
			`barrier(CLK_LOCAL_MEM_FENCE);`
			`}`

			`// if (row ==0 && col == 0)`
			`// {`
			`// printf("MATRIX acc:\n");`
			`// print_mat((float*)acc, WPT_M, WPT_N);`
			`// }`

			`// Store the final results in C`
			`for (int wm = 0; wm < WPT_M; wm++)`
			`{`
			`for (int wn = 0; wn < WPT_N; wn++)`
			`{`
			`int r = globalRow + wm;`
			`int c = globalCol + wn;`

			`if (r < M && c < N)`
			`{`
			`C[r * N + c] = acc[wm][wn];`
			`}`
			`}`
			`}`
			`}`

			`/*`
			`// super super slow sgemm kernel by heehoon`
			`__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {`
			`int i = get_global_id(0); // row index of C`
			`int j = get_global_id(1); // column index of C`
			`if (i >= M || j >= N) return; // boundary check`

			`C[i * N + j] = 0;`
			`for (int k = 0; k < K; k++) {`
			`C[i * N + j] += A[i * K + k] * B[k * N + j];`
			`}`
			`}`
			`*/`