chundoong-lab-ta/SamsungDS22/submissions/HW5/hj614.yoo/kernel.cl

// super super slow sgemm kernel by heehoon
// Tile edge length: used to turn (group id, local id) into a global index and
// to size the __local tiles declared by the kernels below.
#define WORK_SIZE (16)
// Number of floats packed into one vector element; matches the float4 type
// that sgemm uses for A, B and C.
#define VECT_SIZE (4)

// Tiled single-precision matrix multiply C = A * B, four output columns per
// work-item.
//
//   A    : M x K, row-major, addressed as float4 (K / VECT_SIZE vectors per row)
//   B    : K x N, row-major, addressed as float4 (N / VECT_SIZE vectors per row)
//   C    : M x N, row-major, addressed as float4 (N / VECT_SIZE vectors per row)
//   base : not used by this kernel
//
// The indexing assumes K and N are multiples of VECT_SIZE, and that every
// work-group is WORK_SIZE x (WORK_SIZE / VECT_SIZE) work-items (the shape of
// locA / locB) -- NOTE(review): confirm against the host launch code.
__kernel void sgemm(__global float4 *A, __global float4 *B, __global float4 *C, int M, int N, int K , int base) {
  int locRow = get_local_id(0); // local row index of C
  int locCol = get_local_id(1); // local (float4) column index of C
  int gloRow = WORK_SIZE*get_group_id(0)+locRow; // global row index of C
  int gloCol = (WORK_SIZE/VECT_SIZE)*get_group_id(1)+locCol; // global (float4) column index of C

  const int vecK = K / VECT_SIZE; // float4 elements per row of A
  const int vecN = N / VECT_SIZE; // float4 elements per row of B and C

  // One WORK_SIZE x WORK_SIZE float tile of A and of B, stored as float4.
  __local float4 locA[WORK_SIZE][WORK_SIZE/VECT_SIZE];
  __local float4 locB[WORK_SIZE][WORK_SIZE/VECT_SIZE];

  float4 locC = {0.0f,0.0f,0.0f,0.0f};
  float4 zero = {0.0f,0.0f,0.0f,0.0f};

  // Round up: a trailing partial tile must still be processed. The boundary
  // checks below zero-fill the part of the tile that lies beyond K, so it
  // contributes nothing to the sum.
  int nTiles = (K + WORK_SIZE - 1) / WORK_SIZE;
  for (int kk = 0; kk < nTiles; kk++) {
    int kRow = WORK_SIZE * kk + locRow;                // row of B loaded by this work-item
    int kCol = (WORK_SIZE/VECT_SIZE) * kk + locCol;    // float4 column of A loaded by this work-item

    if (gloRow < M && kCol < vecK) // boundary check
      locA[locRow][locCol] = A[gloRow * vecK + kCol];
    else
      locA[locRow][locCol] = zero;
    if (kRow < K && gloCol < vecN) // boundary check
      locB[locRow][locCol] = B[kRow * vecN + gloCol];
    else
      locB[locRow][locCol] = zero;
    // Both tiles must be fully written before any work-item reads them.
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 0; k < WORK_SIZE/VECT_SIZE; k++) {
      const float4 vecA = locA[locRow][k];
      // Each lane of vecA scales the matching row of the B tile; lanes are
      // accumulated in x, y, z, w order.
      locC += locB[VECT_SIZE*k + 0][locCol] * vecA.x;
      locC += locB[VECT_SIZE*k + 1][locCol] * vecA.y;
      locC += locB[VECT_SIZE*k + 2][locCol] * vecA.z;
      locC += locB[VECT_SIZE*k + 3][locCol] * vecA.w;
    }
    // Do not let the next iteration overwrite tiles that are still being read.
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  // Out-of-range work-items only return here, after the loop, so that every
  // work-item in the group executes the same sequence of barriers.
  if (gloRow >= M || gloCol >= vecN) return; // boundary check
  C[gloRow*vecN+gloCol]=locC;
}

// Transposes a row-major P x Q matrix `input` into the row-major Q x P matrix
// `output`, one element per work-item.
//
//   P, Q   : number of rows / columns of `input`
//   input  : P * Q floats
//   output : Q * P floats
//
// NOTE(review): the tile arithmetic assumes WORK_SIZE x WORK_SIZE work-groups;
// confirm against the host launch code.
__kernel void transpose(const int P, const int Q,
                        const __global float* input,
                        __global float* output) {
  // Thread identifiers
  const int locRow = get_local_id(0);
  const int locCol = get_local_id(1);
  const int gloRow = get_group_id(0)*WORK_SIZE + locRow; // 0..P
  const int gloCol = get_group_id(1)*WORK_SIZE + locCol; // 0..Q

  // Local staging buffer for one tile
  __local float buffer[WORK_SIZE][WORK_SIZE];

  // Load element (gloRow, gloCol); a row of `input` holds Q elements, so the
  // row stride is Q.
  if (gloRow < P && gloCol < Q) {
    buffer[locRow][locCol] = input[gloRow*Q + gloCol];
  }

  // The barrier sits outside the bounds checks so that every work-item in the
  // group reaches it.
  barrier(CLK_LOCAL_MEM_FENCE);

  // Coordinates of the same element in the transposed matrix
  const int newRow = get_group_id(1)*WORK_SIZE + locCol; // 0..Q
  const int newCol = get_group_id(0)*WORK_SIZE + locRow; // 0..P

  // Store element (newRow, newCol); a row of `output` holds P elements, so the
  // row stride is P. Each work-item reads back the slot it wrote itself.
  if (newRow < Q && newCol < P) {
    output[newRow*P + newCol] = buffer[locRow][locCol];
  }
}

// Copies the row-major P x Q matrix `input` into the top-left corner of the
// row-major nP x nQ matrix `output`; every other element of `output` is set
// to zero. One output element per work-item.
__kernel void addPadding(const int P, const int Q, const int nP, const int nQ,
                        const __global float* input,
                        __global float* output) {
  const int row = get_group_id(0)*WORK_SIZE + get_local_id(0); // 0..nP
  const int col = get_group_id(1)*WORK_SIZE + get_local_id(1); // 0..nQ

  // Work-items outside the padded matrix have nothing to write.
  if (row >= nP || col >= nQ) {
    return;
  }

  // Inside the source extent take the source value, otherwise pad with zero.
  // Only the selected operand is evaluated, so `input` is never read out of range.
  output[row*nQ + col] = (row < P && col < Q) ? input[row*Q + col] : 0.0f;
}

// Copies the top-left cP x cQ region of `input` (row stride Q) into the
// row-major cP x cQ matrix `output`. One output element per work-item.
// P is accepted but not used.
__kernel void delPadding(const int P, const int Q, const int cP, const int cQ,
                        const __global float* input,
                        __global float* output) {
  const int row = get_group_id(0)*WORK_SIZE + get_local_id(0); // 0..cP
  const int col = get_group_id(1)*WORK_SIZE + get_local_id(1); // 0..cQ

  // Work-items outside the cropped matrix have nothing to copy.
  if (row >= cP || col >= cQ) {
    return;
  }

  const int src = row*Q + col;   // position in the padded source
  const int dst = row*cQ + col;  // position in the cropped destination
  output[dst] = input[src];
}
. 2022-09-29 18:01:45 +09:00			`// super super slow sgemm kernel by heehoon`
			`#define WORK_SIZE (16)`
			`#define VECT_SIZE (4)`

			`__kernel void sgemm(__global float4 *A, __global float4 *B, __global float4 *C, int M, int N, int K , int base) {`
			`int locRow = get_local_id(0); // local row index of C`
			`int locCol = get_local_id(1); // local column index of C`
			`int gloRow = WORK_SIZE*get_group_id(0)+locRow; // global row index of C`
			`int gloCol = (WORK_SIZE/VECT_SIZE)*get_group_id(1)+locCol; // global column index of C`

			`__local float4 locA[WORK_SIZE][WORK_SIZE/VECT_SIZE];`
			`__local float4 locB[WORK_SIZE][WORK_SIZE/VECT_SIZE];`

			`float4 locC = {0.0f,0.0f,0.0f,0.0f};`
			`float4 zero = {0.0f,0.0f,0.0f,0.0f};`
			`int nTiles = K / WORK_SIZE;`
			`for (int kk = 0; kk < nTiles; kk++) {`
			`int kRow = WORK_SIZE * kk + locRow;`
			`int kCol = (WORK_SIZE/VECT_SIZE) * kk + locCol;`

			`if (gloRow < M && kCol < K / VECT_SIZE) // boundary check`
			`locA[locRow][locCol] = A[gloRow * (K / VECT_SIZE) + kCol];`
			`else`
			`locA[locRow][locCol] = zero;`
			`if (kRow < K && gloCol < N / VECT_SIZE) // boundary check`
			`locB[locRow][locCol] = B[kRow * (N / VECT_SIZE) + gloCol];`
			`else`
			`locB[locRow][locCol] = zero;`
			`barrier(CLK_LOCAL_MEM_FENCE);`

			`float4 vecA, vecB;`
			`float valA;`
			`for (int k = 0; k < WORK_SIZE/VECT_SIZE; k++) {`
			`vecA = locA[locRow][k];`
			`for (int m = 0; m < VECT_SIZE; m++) {`
			`vecB = locB[VECT_SIZE*k+m][locCol];`
			`switch(m) {`
			`case 0: valA = vecA.x; break;`
			`case 1: valA = vecA.y; break;`
			`case 2: valA = vecA.z; break;`
			`case 3: valA = vecA.w; break;`
			`}`
			`locC.x += vecB.x * valA;`
			`locC.y += vecB.y * valA;`
			`locC.z += vecB.z * valA;`
			`locC.w += vecB.w * valA;`
			`}`
			`}`
			`barrier(CLK_LOCAL_MEM_FENCE);`
			`}`
			`if (gloRow >= M \|\| gloCol >= N / VECT_SIZE) return; // boundary check`
			`C[gloRow*(N/VECT_SIZE)+gloCol]=locC;`
			`}`

			`__kernel void transpose(const int P, const int Q,`
			`const __global float* input,`
			`__global float* output) {`
			`// Thread identifiers`
			`const int locRow = get_local_id(0);`
			`const int locCol = get_local_id(1);`
			`const int gloRow = get_group_id(0)*WORK_SIZE + locRow; // 0..P`
			`const int gloCol = get_group_id(1)*WORK_SIZE + locCol; // 0..Q`

			`// Set-up the local memory for shuffling`
			`__local float buffer[WORK_SIZE][WORK_SIZE];`

			`// Swap the x and y coordinates to perform the rotation (coalesced)`
			`if (gloRow < P && gloCol < Q) {`
			`buffer[locRow][locCol] = input[gloRow*P + gloCol];`
			`}`

			`// Synchronise all threads`
			`barrier(CLK_LOCAL_MEM_FENCE);`

			`// We don't have to swap the x and y thread indices here,`
			`// because that's already done in the local memory`
			`const int newRow = get_group_id(1)*WORK_SIZE + locCol;`
			`const int newCol = get_group_id(0)*WORK_SIZE + locRow;`

			`// Store the transposed result (coalesced)`
			`if (newRow < Q && newCol < P) {`
			`output[newRow*Q + newCol] = buffer[locRow][locCol];`
			`}`
			`}`

			`__kernel void addPadding(const int P, const int Q, const int nP, const int nQ,`
			`const __global float* input,`
			`__global float* output) {`
			`// Thread identifiers`
			`const int gloRow = get_group_id(0)*WORK_SIZE + get_local_id(0); // 0..P`
			`const int gloCol = get_group_id(1)*WORK_SIZE + get_local_id(1); // 0..Q`
			`float value=0.0f;`
			`if (gloRow < nP && gloCol < nQ) {`
			`if (gloRow < P && gloCol < Q)`
			`value = input[gloRow*Q+gloCol];`
			`output[gloRow*nQ+gloCol] = value;`
			`}`
			`}`

			`__kernel void delPadding(const int P, const int Q, const int cP, const int cQ,`
			`const __global float* input,`
			`__global float* output) {`
			`// Thread identifiers`
			`const int gloRow = get_group_id(0)*WORK_SIZE + get_local_id(0); // 0..P`
			`const int gloCol = get_group_id(1)*WORK_SIZE + get_local_id(1); // 0..Q/`
			`if (gloRow < cP && gloCol < cQ) {`
			`output[gloRow*cQ+gloCol] = input[gloRow*Q+gloCol];`
			`}`
			`}`