// Tiled SGEMM plus matrix-layout helper kernels (row-major storage).
// Originally: "super super slow sgemm kernel by heehoon".

#define WORK_SIZE (16)  // tile edge length (work-items per work-group dimension)
#define VECT_SIZE (4)   // scalars per float4 vector load/store

// C = A * B for row-major matrices, vectorized as float4 along rows.
// A is M x K, B is K x N, C is M x N; K and N must be multiples of
// VECT_SIZE because the global buffers are addressed in float4 units.
// `base` is unused here but kept so the host-side kernel-argument
// setup stays compatible.
__kernel void sgemm(__global float4 *A, __global float4 *B,
                    __global float4 *C,
                    int M, int N, int K, int base)
{
    int locRow = get_local_id(0);                      // local row index of C
    int locCol = get_local_id(1);                      // local float4-column index of C
    int gloRow = WORK_SIZE * get_group_id(0) + locRow; // global row index of C
    int gloCol = (WORK_SIZE / VECT_SIZE) * get_group_id(1) + locCol; // global float4 column of C

    __local float4 locA[WORK_SIZE][WORK_SIZE / VECT_SIZE]; // staged tile of A
    __local float4 locB[WORK_SIZE][WORK_SIZE / VECT_SIZE]; // staged tile of B

    float4 locC = {0.0f, 0.0f, 0.0f, 0.0f}; // per-thread accumulator
    float4 zero = {0.0f, 0.0f, 0.0f, 0.0f};

    // Round up so a K that is not a multiple of WORK_SIZE still gets its
    // final partial tile; out-of-range loads below are replaced by zero,
    // which contributes nothing to the dot products. (Identical to the
    // old K / WORK_SIZE whenever the host pads K, as addPadding suggests.)
    int nTiles = (K + WORK_SIZE - 1) / WORK_SIZE;

    for (int kk = 0; kk < nTiles; kk++) {
        int kRow = WORK_SIZE * kk + locRow;               // row of B within this K-tile
        int kCol = (WORK_SIZE / VECT_SIZE) * kk + locCol; // float4 column of A within this K-tile

        // Stage one tile of A and one tile of B into local memory,
        // zero-filling anything outside the matrix bounds.
        if (gloRow < M && kCol < K / VECT_SIZE)
            locA[locRow][locCol] = A[gloRow * (K / VECT_SIZE) + kCol];
        else
            locA[locRow][locCol] = zero;

        if (kRow < K && gloCol < N / VECT_SIZE)
            locB[locRow][locCol] = B[kRow * (N / VECT_SIZE) + gloCol];
        else
            locB[locRow][locCol] = zero;

        barrier(CLK_LOCAL_MEM_FENCE); // tiles fully loaded before compute

        float4 vecA, vecB;
        float valA;
        for (int k = 0; k < WORK_SIZE / VECT_SIZE; k++) {
            vecA = locA[locRow][k];
            for (int m = 0; m < VECT_SIZE; m++) {
                vecB = locB[VECT_SIZE * k + m][locCol];
                // Pick the m-th scalar out of vecA: OpenCL float4 has no
                // runtime component indexing, hence the switch.
                switch (m) {
                case 0: valA = vecA.x; break;
                case 1: valA = vecA.y; break;
                case 2: valA = vecA.z; break;
                case 3: valA = vecA.w; break;
                }
                locC.x += vecB.x * valA;
                locC.y += vecB.y * valA;
                locC.z += vecB.z * valA;
                locC.w += vecB.w * valA;
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE); // everyone done reading before the next load pass
    }

    if (gloRow >= M || gloCol >= N / VECT_SIZE) return; // boundary check
    C[gloRow * (N / VECT_SIZE) + gloCol] = locC;
}

// Transposes a row-major P x Q matrix into a row-major Q x P matrix,
// staging WORK_SIZE x WORK_SIZE tiles through local memory.
__kernel void transpose(const int P, const int Q,
                        const __global float* input,
                        __global float* output)
{
    // Thread identifiers
    const int locRow = get_local_id(0);
    const int locCol = get_local_id(1);
    const int gloRow = get_group_id(0) * WORK_SIZE + locRow; // 0..P
    const int gloCol = get_group_id(1) * WORK_SIZE + locCol; // 0..Q

    // Set-up the local memory for shuffling
    __local float buffer[WORK_SIZE][WORK_SIZE];

    // Coalesced tile read.
    // FIX(review): the original used input[gloRow*P + gloCol] and
    // output[newRow*Q + newCol]. For the row-major layout used by the
    // sibling kernels (addPadding/delPadding index input[gloRow*Q+gloCol])
    // the row stride must be Q on input and P on output; the old strides
    // read/wrote out of bounds whenever P != Q. Confirm against the host
    // caller, which is not visible here.
    if (gloRow < P && gloCol < Q) {
        buffer[locRow][locCol] = input[gloRow * Q + gloCol];
    }

    // Synchronise all threads
    barrier(CLK_LOCAL_MEM_FENCE);

    // Swap the group and local indices to locate where this thread's
    // element lands in the transposed matrix.
    const int newRow = get_group_id(1) * WORK_SIZE + locCol;
    const int newCol = get_group_id(0) * WORK_SIZE + locRow;

    // Store the transposed result
    if (newRow < Q && newCol < P) {
        output[newRow * P + newCol] = buffer[locRow][locCol];
    }
}

// Copies a P x Q row-major matrix into the top-left corner of an
// nP x nQ row-major matrix, zero-filling the padded border.
__kernel void addPadding(const int P, const int Q,
                         const int nP, const int nQ,
                         const __global float* input,
                         __global float* output)
{
    // Thread identifiers
    const int gloRow = get_group_id(0) * WORK_SIZE + get_local_id(0); // 0..nP
    const int gloCol = get_group_id(1) * WORK_SIZE + get_local_id(1); // 0..nQ

    float value = 0.0f;
    if (gloRow < nP && gloCol < nQ) {
        if (gloRow < P && gloCol < Q)
            value = input[gloRow * Q + gloCol];
        output[gloRow * nQ + gloCol] = value;
    }
}

// Extracts the top-left cP x cQ sub-matrix of a P x Q row-major matrix,
// i.e. drops the border previously added by addPadding.
__kernel void delPadding(const int P, const int Q,
                         const int cP, const int cQ,
                         const __global float* input,
                         __global float* output)
{
    // Thread identifiers
    const int gloRow = get_group_id(0) * WORK_SIZE + get_local_id(0); // 0..cP
    const int gloCol = get_group_id(1) * WORK_SIZE + get_local_id(1); // 0..cQ

    if (gloRow < cP && gloCol < cQ) {
        output[gloRow * cQ + gloCol] = input[gloRow * Q + gloCol];
    }
}