// chundoong-lab-ta/SamsungDS22/submissions/HW5/ty.jeon/kernel.cl

// super super slow sgemm kernel by heehoon


// Edge length of the square tiles (Asub/Bsub) that sgemm stages in local memory.
#define TS 32
// Number of output elements each sgemm work-item accumulates (size of acc[]).
#define WPT 8
// Row stride between the WPT elements handled by one work-item (TS / WPT).
#define RTS (TS/WPT)

// Tiled single-precision matrix multiply: C = A * B.
//
//   A  read as A[r*K + k]   (M x K, row-major)
//   B  read as B[k*N + c]   (K x N, row-major)
//   C  written as C[r*N + c] (M x N, row-major)
//
// Each work-group computes one TS x TS tile of C. Each work-item accumulates
// WPT elements of that tile, all in the same column and RTS rows apart.
// The local indexing (row + w*RTS) only stays inside the TS x TS tiles when
// get_local_id(0) < RTS and get_local_id(1) < TS.
// NOTE(review): this implies a work-group of RTS x TS work-items -- confirm
// against the host-side launch configuration.
//
// Out-of-range loads are replaced by 0.0f and out-of-range stores are skipped,
// so M, N and K do not have to be multiples of TS as long as the launch grid
// is rounded up to cover C. For dimensions that are already multiples of TS
// the guards never trigger.
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {

    const int row = get_local_id(0);
    const int col = get_local_id(1);
    const int global_row = TS * get_group_id(0) + row;
    const int global_col = TS * get_group_id(1) + col;

    // Tiles of A and B shared by the whole work-group.
    __local float Asub[TS][TS];
    __local float Bsub[TS][TS];

    // Per-work-item partial sums, one per output row handled by this work-item.
    float acc[WPT];
    for (int w = 0; w < WPT; w++) {
        acc[w] = 0.0f;
    }

    // Round up so a trailing partial tile along K is processed, not dropped.
    const int numTiles = (K + TS - 1) / TS;
    for (int t = 0; t < numTiles; t++) {

        // Position of this work-item inside the current K-tile; does not
        // depend on w, so it is computed once per tile.
        const int t_row = TS * t + row;
        const int t_col = TS * t + col;

        // Cooperative load: each work-item fetches WPT elements of each tile.
        for (int w = 0; w < WPT; w++) {
            const int a_row = global_row + w * RTS;
            const int b_row = t_row + w * RTS;

            // Zero-fill outside the matrices so padding adds nothing to the sum.
            Asub[row + w * RTS][col] =
                (a_row < M && t_col < K) ? A[a_row * K + t_col] : 0.0f;
            Bsub[row + w * RTS][col] =
                (b_row < K && global_col < N) ? B[b_row * N + global_col] : 0.0f;
        }

        // Every work-item must reach this barrier: both tiles have to be
        // complete before anyone reads them.
        barrier(CLK_LOCAL_MEM_FENCE);

        for (int k = 0; k < TS; k++) {
            for (int w = 0; w < WPT; w++) {
                acc[w] += Asub[row + w * RTS][k] * Bsub[k][col];
            }
        }

        // Keep the tiles intact until all work-items are done with them;
        // the next iteration overwrites Asub/Bsub.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write back only the elements that lie inside C.
    if (global_col < N) {
        for (int w = 0; w < WPT; w++) {
            const int c_row = global_row + w * RTS;
            if (c_row < M) {
                C[c_row * N + global_col] = acc[w];
            }
        }
    }
}


// Copies an M x N row-major matrix into an M_XL x N_XL row-major buffer and
// writes 0.0f to every element of the larger buffer that lies outside the
// original M x N extent.
//
//   M, N        dimensions of `input`
//   input       source matrix, read as input[row*N + col]
//   M_XL, N_XL  dimensions of `output`
//   output      destination matrix, written as output[row*N_XL + col]
//
// One work-item handles one element of `output`; work-items that fall outside
// M_XL x N_XL do nothing.
__kernel void paddingAddZeroes(const int M, const int N, const __global float* input, const int M_XL, const int N_XL,__global float* output) {

    // Position of this work-item in the launch grid. The work-group size is
    // queried instead of being hard-coded to 16, so the indices stay correct
    // for whatever local size the host launches with.
    const int row = (int)(get_group_id(0) * get_local_size(0) + get_local_id(0)); // 0..M_XL
    const int col = (int)(get_group_id(1) * get_local_size(1) + get_local_id(1)); // 0..N_XL

    // Nothing to do outside the padded matrix.
    if (row >= M_XL || col >= N_XL) {
        return;
    }

    // Copy the input element where one exists, otherwise pad with zero.
    const bool inside_input = (row < M) && (col < N);
    output[row*N_XL + col] = inside_input ? input[row*N + col] : 0.0f;
}

// Gathers four row-partitioned partial results into a single M x N matrix.
//
//   C1..C4    partial result buffers, each read with row stride N_xl
//   result    destination, written as result[row*N + col]
//   M_xl      unused by this kernel
//   N_xl      row stride of C1..C4
//   M_single  number of consecutive result rows taken from each of C1, C2, C3;
//             every row from 3*M_single onward is taken from C4
//   M, N      dimensions of `result`
//
// One work-item handles one element of `result`; work-items outside M x N
// return without touching memory.
__kernel void mergeResult(__global float * C1, __global float * C2, __global float * C3, __global float * C4, __global float * result, int M_xl, int N_xl, int M_single, int M, int N){

	// Position of this work-item in the launch grid. The work-group size is
	// queried instead of being hard-coded to 32, so the indices stay correct
	// for whatever local size the host launches with.
	const int row = (int)(get_group_id(0) * get_local_size(0) + get_local_id(0)); // M
	const int col = (int)(get_group_id(1) * get_local_size(1) + get_local_id(1)); // N

	if(row >= M || col >= N) return;

	// Select the partial buffer that owns this row and the row index inside it.
	// A comparison chain is used (not row / M_single) so no division is needed.
	__global float * src;
	int aligned_row;

	if(row < M_single){
		src = C1;
		aligned_row = row;
	}
	else if(row < 2*M_single){
		src = C2;
		aligned_row = row - M_single;
	}
	else if(row < 3*M_single){
		src = C3;
		aligned_row = row - 2*M_single;
	}
	else{
		src = C4;
		aligned_row = row - 3*M_single;
	}

	result[row * N + col] = src[aligned_row * N_xl + col];
}