chundoong-lab-ta/SamsungDS22/submissions/HW5/hj614.yoo/kernel.cl

109 lines
3.9 KiB
Common Lisp

// Tiled sgemm kernel and matrix helper kernels (baseline sgemm credited to heehoon).
//
// WORK_SIZE: edge length of the square tile handled by one work-group.
// VECT_SIZE: number of floats packed into one float4 element.
// The kernels below divide WORK_SIZE by VECT_SIZE, so WORK_SIZE must stay a
// multiple of VECT_SIZE.
#define WORK_SIZE 16
#define VECT_SIZE 4
/*
 * Tiled single-precision matrix multiply: C = A * B.
 *
 * A is M x K, B is K x N and C is M x N, all row-major and accessed as float4.
 * A row of A therefore holds K / VECT_SIZE vectors and a row of B or C holds
 * N / VECT_SIZE vectors; the indexing assumes K and N are multiples of
 * VECT_SIZE.
 *
 * The local tiles are indexed directly by the local ids, so the work-group
 * shape must be WORK_SIZE x (WORK_SIZE / VECT_SIZE).  Each work-item produces
 * one float4 of C (VECT_SIZE consecutive columns of one row).
 *
 * The K dimension is walked in tiles of WORK_SIZE.  The tile count is rounded
 * up so that a trailing partial tile is still processed; elements outside the
 * matrices are loaded as zero and contribute nothing to the sum.
 *
 * `base` is not used by this kernel; it is kept so the argument list seen by
 * the host stays unchanged.
 */
__kernel void sgemm(__global float4 *A, __global float4 *B, __global float4 *C, int M, int N, int K , int base) {
    const int locRow = get_local_id(0);                                    // row inside the tile
    const int locCol = get_local_id(1);                                    // float4 column inside the tile
    const int gloRow = WORK_SIZE * get_group_id(0) + locRow;               // row of C
    const int gloCol = (WORK_SIZE / VECT_SIZE) * get_group_id(1) + locCol; // float4 column of C

    // Row lengths measured in float4 elements.
    const int vecK = K / VECT_SIZE;
    const int vecN = N / VECT_SIZE;

    __local float4 locA[WORK_SIZE][WORK_SIZE / VECT_SIZE];
    __local float4 locB[WORK_SIZE][WORK_SIZE / VECT_SIZE];

    const float4 zero = (float4)(0.0f);
    float4 acc = zero;

    // Round up: truncating here would silently drop the last partial K-tile.
    const int nTiles = (K + WORK_SIZE - 1) / WORK_SIZE;

    for (int kk = 0; kk < nTiles; kk++) {
        const int kRow = WORK_SIZE * kk + locRow;               // row of B for this tile
        const int kCol = (WORK_SIZE / VECT_SIZE) * kk + locCol; // float4 column of A for this tile

        // Stage one float4 of A and one of B per work-item, zero-filling
        // anything that falls outside the matrices.
        if (gloRow < M && kCol < vecK) {
            locA[locRow][locCol] = A[gloRow * vecK + kCol];
        } else {
            locA[locRow][locCol] = zero;
        }
        if (kRow < K && gloCol < vecN) {
            locB[locRow][locCol] = B[kRow * vecN + gloCol];
        } else {
            locB[locRow][locCol] = zero;
        }

        // Both tiles must be completely written before anyone reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Each float4 of A supplies VECT_SIZE scalars, each of which scales
        // the matching row of the B tile.  Accumulation order (x, y, z, w)
        // matches a plain k-ascending dot product.
        for (int k = 0; k < WORK_SIZE / VECT_SIZE; k++) {
            const float4 vecA = locA[locRow][k];
            acc += locB[VECT_SIZE * k + 0][locCol] * vecA.x;
            acc += locB[VECT_SIZE * k + 1][locCol] * vecA.y;
            acc += locB[VECT_SIZE * k + 2][locCol] * vecA.z;
            acc += locB[VECT_SIZE * k + 3][locCol] * vecA.w;
        }

        // Do not let the next iteration overwrite tiles still being read.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Work-items outside C took part in the loads and barriers above but
    // must not store a result.
    if (gloRow >= M || gloCol >= vecN) {
        return;
    }
    C[gloRow * vecN + gloCol] = acc;
}
/*
 * Transpose a row-major P x Q matrix `input` into the row-major Q x P matrix
 * `output`, i.e. output[c][r] = input[r][c].
 *
 * One work-item moves one element.  `buffer` is indexed directly by the local
 * ids, so the work-group must be at most WORK_SIZE x WORK_SIZE.
 *
 * The row stride of `input` is Q (its number of columns) and the row stride
 * of `output` is P.  Using P and Q the other way round only gives the right
 * answer for square matrices.
 */
__kernel void transpose(const int P, const int Q,
                        const __global float* input,
                        __global float* output) {
    // Thread identifiers
    const int locRow = get_local_id(0);
    const int locCol = get_local_id(1);
    const int gloRow = get_group_id(0)*WORK_SIZE + locRow; // input row, 0..P
    const int gloCol = get_group_id(1)*WORK_SIZE + locCol; // input column, 0..Q

    // Staging tile in local memory.
    __local float buffer[WORK_SIZE][WORK_SIZE];

    // Load this work-item's element (input has Q elements per row).
    if (gloRow < P && gloCol < Q) {
        buffer[locRow][locCol] = input[gloRow*Q + gloCol];
    }

    // Synchronise all work-items of the group.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Destination coordinates: the input column becomes the output row and
    // the input row becomes the output column.
    const int newRow = get_group_id(1)*WORK_SIZE + locCol; // 0..Q
    const int newCol = get_group_id(0)*WORK_SIZE + locRow; // 0..P

    // Each work-item reads back the same local element it stored above, so
    // this guard is equivalent to the load guard (output has P elements per row).
    if (newRow < Q && newCol < P) {
        output[newRow*P + newCol] = buffer[locRow][locCol];
    }
}
/*
 * Copy the row-major P x Q matrix `input` into the top-left corner of the
 * row-major nP x nQ matrix `output`; every other element of `output` is set
 * to 0.0f.  One work-item writes one element of `output`.
 */
__kernel void addPadding(const int P, const int Q, const int nP, const int nQ,
                         const __global float* input,
                         __global float* output) {
    // Position of this work-item in the padded matrix.
    const int row = get_local_id(0) + WORK_SIZE * get_group_id(0);
    const int col = get_local_id(1) + WORK_SIZE * get_group_id(1);

    // Nothing to write outside the padded matrix.
    if (row >= nP || col >= nQ) {
        return;
    }

    // Only positions that exist in the source are read; the rest become zero.
    const bool inSource = (row < P) && (col < Q);
    output[row * nQ + col] = inSource ? input[row * Q + col] : 0.0f;
}
/*
 * Copy the top-left cP x cQ corner of `input` (row-major, Q elements per row)
 * into the row-major cP x cQ matrix `output`.  One work-item copies one
 * element.  P is accepted but not used by the kernel.
 */
__kernel void delPadding(const int P, const int Q, const int cP, const int cQ,
                         const __global float* input,
                         __global float* output) {
    // Position of this work-item in the cropped matrix.
    const int row = get_local_id(0) + WORK_SIZE * get_group_id(0);
    const int col = get_local_id(1) + WORK_SIZE * get_group_id(1);

    // Work-items beyond the cropped extent have nothing to copy.
    if (row >= cP || col >= cQ) {
        return;
    }

    output[row * cQ + col] = input[row * Q + col];
}