109 lines
3.9 KiB
Common Lisp
109 lines
3.9 KiB
Common Lisp
// super super slow sgemm kernel by heehoon
|
|
// Tile edge length, in scalar elements. The kernels in this file use it as
// the multiplier from work-group id to global index and as the dimension of
// their __local staging arrays.
#define WORK_SIZE (16)
|
|
// Number of floats per vector element. sgemm accesses its matrices as float4,
// so column counts and row strides there are divided by this value.
#define VECT_SIZE (4)
|
|
|
|
/*
 * sgemm: single-precision C = A * B, tiled through local memory.
 *
 * All three matrices are addressed as row-major arrays of float4:
 *   A: M rows, K / VECT_SIZE vectors per row
 *   B: K rows, N / VECT_SIZE vectors per row
 *   C: M rows, N / VECT_SIZE vectors per row
 * Each work-item accumulates one float4 of C (four adjacent columns of one row).
 *
 * Parameters:
 *   A, B  input matrices (read only by this kernel)
 *   C     output matrix; every in-range element is overwritten
 *   M,N,K matrix dimensions in scalar elements
 *   base  not referenced by the kernel body; kept so the argument list
 *         seen by the host stays unchanged
 *
 * NOTE(review): the local tiles are indexed directly by the local ids, so the
 * kernel presumably expects a work-group of WORK_SIZE x (WORK_SIZE / VECT_SIZE)
 * work-items -- confirm against the host-side launch.
 * NOTE(review): row strides are computed as K / VECT_SIZE and N / VECT_SIZE,
 * so K and N are expected to be multiples of VECT_SIZE.
 */
__kernel void sgemm(__global float4 *A, __global float4 *B, __global float4 *C, int M, int N, int K , int base) {
    const int locRow = get_local_id(0);                                    // local row index of C
    const int locCol = get_local_id(1);                                    // local (float4) column index of C
    const int gloRow = WORK_SIZE * get_group_id(0) + locRow;               // global row index of C
    const int gloCol = (WORK_SIZE / VECT_SIZE) * get_group_id(1) + locCol; // global (float4) column index of C

    // Row lengths measured in float4 elements.
    const int vecK = K / VECT_SIZE;
    const int vecN = N / VECT_SIZE;

    // One WORK_SIZE x WORK_SIZE scalar tile of A and of B, stored as float4.
    __local float4 locA[WORK_SIZE][WORK_SIZE / VECT_SIZE];
    __local float4 locB[WORK_SIZE][WORK_SIZE / VECT_SIZE];

    const float4 zero = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
    float4 locC = zero;

    // Round up so that a trailing partial tile (K not a multiple of WORK_SIZE)
    // is still processed; the guarded loads below zero-fill the part of the
    // tile that lies beyond K, so it contributes nothing to the sum.
    const int nTiles = (K + WORK_SIZE - 1) / WORK_SIZE;

    for (int kk = 0; kk < nTiles; kk++) {
        const int kRow = WORK_SIZE * kk + locRow;                // row of B staged by this work-item
        const int kCol = (WORK_SIZE / VECT_SIZE) * kk + locCol;  // float4 column of A staged by this work-item

        // Stage one float4 of A and one of B, substituting zeros out of range.
        if (gloRow < M && kCol < vecK) {
            locA[locRow][locCol] = A[gloRow * vecK + kCol];
        } else {
            locA[locRow][locCol] = zero;
        }
        if (kRow < K && gloCol < vecN) {
            locB[locRow][locCol] = B[kRow * vecN + gloCol];
        } else {
            locB[locRow][locCol] = zero;
        }

        // Both tiles must be completely written before any work-item reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Each float4 of the A row holds VECT_SIZE (= 4) consecutive k values;
        // component i pairs with row VECT_SIZE*k + i of the B tile.
        for (int k = 0; k < WORK_SIZE / VECT_SIZE; k++) {
            const float4 vecA = locA[locRow][k];
            locC += locB[VECT_SIZE * k + 0][locCol] * vecA.x;
            locC += locB[VECT_SIZE * k + 1][locCol] * vecA.y;
            locC += locB[VECT_SIZE * k + 2][locCol] * vecA.z;
            locC += locB[VECT_SIZE * k + 3][locCol] * vecA.w;
        }

        // Do not overwrite the tiles while other work-items are still reading.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Out-of-range work-items took part in the barriers above but store nothing.
    if (gloRow >= M || gloCol >= vecN) {
        return;
    }
    C[gloRow * vecN + gloCol] = locC;
}
|
|
|
|
/*
 * transpose: writes the transpose of `input` into `output`.
 *
 * `input` is read as a row-major P x Q matrix (row stride Q) and `output` is
 * written as a row-major Q x P matrix (row stride P), so that
 * output[c*P + r] = input[r*Q + c] for 0 <= r < P, 0 <= c < Q.
 *
 * Parameters:
 *   P, Q    number of rows and columns of the input matrix
 *   input   source matrix, P*Q floats
 *   output  destination matrix, Q*P floats
 *
 * NOTE(review): the local buffer is indexed directly by the local ids, so the
 * kernel presumably expects a WORK_SIZE x WORK_SIZE work-group -- confirm
 * against the host-side launch.
 */
__kernel void transpose(const int P, const int Q,
                        const __global float* input,
                        __global float* output) {
    // Thread identifiers
    const int locRow = get_local_id(0);
    const int locCol = get_local_id(1);
    const int gloRow = get_group_id(0)*WORK_SIZE + locRow; // 0..P
    const int gloCol = get_group_id(1)*WORK_SIZE + locCol; // 0..Q

    // Local staging tile
    __local float buffer[WORK_SIZE][WORK_SIZE];

    // Stage element (gloRow, gloCol); the input row stride is Q, its column count.
    if (gloRow < P && gloCol < Q) {
        buffer[locRow][locCol] = input[gloRow*Q + gloCol];
    }

    // Synchronise all threads
    barrier(CLK_LOCAL_MEM_FENCE);

    // Coordinates in the transposed matrix: row and column roles are exchanged.
    const int newRow = get_group_id(1)*WORK_SIZE + locCol;
    const int newCol = get_group_id(0)*WORK_SIZE + locRow;

    // Each work-item reads back the element it staged itself, so this guard is
    // equivalent to the load guard above and never sees an unwritten slot.
    // The output row stride is P, the column count of the Q x P result.
    if (newRow < Q && newCol < P) {
        output[newRow*P + newCol] = buffer[locRow][locCol];
    }
}
|
|
|
|
/*
 * addPadding: copies a row-major P x Q matrix into a row-major nP x nQ matrix,
 * writing 0.0f to every output element that has no counterpart in the input.
 *
 * Parameters:
 *   P, Q    rows and columns of `input` (row stride Q)
 *   nP, nQ  rows and columns of `output` (row stride nQ)
 *   input   source matrix
 *   output  destination matrix; every element inside nP x nQ is written
 */
__kernel void addPadding(const int P, const int Q, const int nP, const int nQ,
                         const __global float* input,
                         __global float* output) {
    // Global coordinates handled by this work-item.
    const int row = WORK_SIZE*get_group_id(0) + get_local_id(0);
    const int col = WORK_SIZE*get_group_id(1) + get_local_id(1);

    // Work-items outside the padded matrix have nothing to write.
    if (row >= nP || col >= nQ) {
        return;
    }

    // Inside the original matrix: copy. In the padding region: zero.
    const bool inSource = (row < P) && (col < Q);
    output[row*nQ + col] = inSource ? input[row*Q + col] : 0.0f;
}
|
|
|
|
/*
 * delPadding: copies the top-left cP x cQ region of a row-major matrix with
 * row stride Q into a densely packed row-major cP x cQ matrix.
 *
 * Parameters:
 *   P       not referenced by the kernel body
 *   Q       row stride (column count) of `input`
 *   cP, cQ  rows and columns of `output` (row stride cQ)
 *   input   source matrix
 *   output  destination matrix; every element inside cP x cQ is written
 */
__kernel void delPadding(const int P, const int Q, const int cP, const int cQ,
                         const __global float* input,
                         __global float* output) {
    // Global coordinates handled by this work-item.
    const int row = WORK_SIZE*get_group_id(0) + get_local_id(0);
    const int col = WORK_SIZE*get_group_id(1) + get_local_id(1);

    // Work-items outside the cropped matrix have nothing to copy.
    if (row >= cP || col >= cQ) {
        return;
    }

    const float value = input[row*Q + col];
    output[row*cQ + col] = value;
}