// Tiled SGEMM plus matrix-layout helper kernels (row-major storage).
// Originally: "super super slow sgemm kernel by heehoon".

#define WORK_SIZE (16)  // tile edge length (work-items per work-group dimension)
#define VECT_SIZE (4)   // scalars per float4 vector load/store

// C = A * B for row-major matrices, vectorized as float4 along rows.
// A is M x K, B is K x N, C is M x N; K and N must be multiples of
// VECT_SIZE because the global buffers are addressed in float4 units.
// `base` is unused here but kept so the host-side kernel-argument
// setup stays compatible.
__kernel void sgemm(__global float4 *A, __global float4 *B,
                    __global float4 *C,
                    int M, int N, int K, int base)
{
    int locRow = get_local_id(0);                      // local row index of C
    int locCol = get_local_id(1);                      // local float4-column index of C
    int gloRow = WORK_SIZE * get_group_id(0) + locRow; // global row index of C
    int gloCol = (WORK_SIZE / VECT_SIZE) * get_group_id(1) + locCol; // global float4 column of C

    __local float4 locA[WORK_SIZE][WORK_SIZE / VECT_SIZE]; // staged tile of A
    __local float4 locB[WORK_SIZE][WORK_SIZE / VECT_SIZE]; // staged tile of B

    float4 locC = {0.0f, 0.0f, 0.0f, 0.0f}; // per-thread accumulator
    float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};

    // Round up so a K that is not a multiple of WORK_SIZE still gets its
    // final partial tile; out-of-range loads below are replaced by zero,
    // which contributes nothing to the dot products. (Identical to the
    // old K / WORK_SIZE whenever the host pads K, as addPadding suggests.)
    int nTiles = (K + WORK_SIZE - 1) / WORK_SIZE;

    for (int kk = 0; kk < nTiles; kk++) {
        int kRow = WORK_SIZE * kk + locRow;               // row of B within this K-tile
        int kCol = (WORK_SIZE / VECT_SIZE) * kk + locCol; // float4 column of A within this K-tile

        // Stage one tile of A and one tile of B into local memory,
        // zero-filling anything outside the matrix bounds.
        if (gloRow < M && kCol < K / VECT_SIZE)
            locA[locRow][locCol] = A[gloRow * (K / VECT_SIZE) + kCol];
        else
            locA[locRow][locCol] = zero;

        if (kRow < K && gloCol < N / VECT_SIZE)
            locB[locRow][locCol] = B[kRow * (N / VECT_SIZE) + gloCol];
        else
            locB[locRow][locCol] = zero;

        barrier(CLK_LOCAL_MEM_FENCE); // tiles fully loaded before compute

        float4 vecA, vecB;
        float valA;
        for (int k = 0; k < WORK_SIZE / VECT_SIZE; k++) {
            vecA = locA[locRow][k];
            for (int m = 0; m < VECT_SIZE; m++) {
                vecB = locB[VECT_SIZE * k + m][locCol];
                // Pick the m-th scalar out of vecA: OpenCL float4 has no
                // runtime component indexing, hence the switch.
                switch (m) {
                case 0: valA = vecA.x; break;
                case 1: valA = vecA.y; break;
                case 2: valA = vecA.z; break;
                case 3: valA = vecA.w; break;
                }
                locC.x += vecB.x * valA;
                locC.y += vecB.y * valA;
                locC.z += vecB.z * valA;
                locC.w += vecB.w * valA;
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE); // everyone done reading before the next load pass
    }

    if (gloRow >= M || gloCol >= N / VECT_SIZE) return; // boundary check
    C[gloRow * (N / VECT_SIZE) + gloCol] = locC;
}

// Transposes a row-major P x Q matrix into a row-major Q x P matrix,
// staging WORK_SIZE x WORK_SIZE tiles through local memory.
__kernel void transpose(const int P, const int Q,
                        const __global float* input,
                        __global float* output)
{
    // Thread identifiers
    const int locRow = get_local_id(0);
    const int locCol = get_local_id(1);
    const int gloRow = get_group_id(0) * WORK_SIZE + locRow; // 0..P
    const int gloCol = get_group_id(1) * WORK_SIZE + locCol; // 0..Q

    // Set-up the local memory for shuffling
    __local float buffer[WORK_SIZE][WORK_SIZE];

    // Coalesced tile read.
    // FIX(review): the original used input[gloRow*P + gloCol] and
    // output[newRow*Q + newCol]. For the row-major layout used by the
    // sibling kernels (addPadding/delPadding index input[gloRow*Q+gloCol])
    // the row stride must be Q on input and P on output; the old strides
    // read/wrote out of bounds whenever P != Q. Confirm against the host
    // caller, which is not visible here.
    if (gloRow < P && gloCol < Q) {
        buffer[locRow][locCol] = input[gloRow * Q + gloCol];
    }

    // Synchronise all threads
    barrier(CLK_LOCAL_MEM_FENCE);

    // Swap the group and local indices to locate where this thread's
    // element lands in the transposed matrix.
    const int newRow = get_group_id(1) * WORK_SIZE + locCol;
    const int newCol = get_group_id(0) * WORK_SIZE + locRow;

    // Store the transposed result
    if (newRow < Q && newCol < P) {
        output[newRow * P + newCol] = buffer[locRow][locCol];
    }
}

// Copies a P x Q row-major matrix into the top-left corner of an
// nP x nQ row-major matrix, zero-filling the padded border.
__kernel void addPadding(const int P, const int Q,
                         const int nP, const int nQ,
                         const __global float* input,
                         __global float* output)
{
    // Thread identifiers
    const int gloRow = get_group_id(0) * WORK_SIZE + get_local_id(0); // 0..nP
    const int gloCol = get_group_id(1) * WORK_SIZE + get_local_id(1); // 0..nQ

    float value = 0.0f;
    if (gloRow < nP && gloCol < nQ) {
        if (gloRow < P && gloCol < Q)
            value = input[gloRow * Q + gloCol];
        output[gloRow * nQ + gloCol] = value;
    }
}

// Extracts the top-left cP x cQ sub-matrix of a P x Q row-major matrix,
// i.e. drops the border previously added by addPadding.
__kernel void delPadding(const int P, const int Q,
                         const int cP, const int cQ,
                         const __global float* input,
                         __global float* output)
{
    // Thread identifiers
    const int gloRow = get_group_id(0) * WORK_SIZE + get_local_id(0); // 0..cP
    const int gloCol = get_group_id(1) * WORK_SIZE + get_local_id(1); // 0..cQ

    if (gloRow < cP && gloCol < cQ) {
        output[gloRow * cQ + gloCol] = input[gloRow * Q + gloCol];
    }
}