chundoong-lab-ta/SamsungDS22/submissions/HW5/c.w.son/kernel.cl

56 lines
1.5 KiB
Common Lisp

// super super slow sgemm kernel by heehoon
// Naive SGEMM: every work-item produces exactly one element of the output.
//
// Work-item (i, j) = (get_global_id(0), get_global_id(1)) accumulates the dot
// product of row i of A with column j of B. Rows are addressed globally in A
// (A[i * K + k]) but relative to start_m in C (C[(i - start_m) * N + j]), so C
// only needs storage for the rows in [start_m, end_m).
//
// Parameters:
//   A        - input, read as a row-major matrix with K columns
//   B        - input, read as a row-major matrix with N columns
//   C        - output, written row-major with N columns; its row 0 is start_m
//   M        - exclusive upper bound on the row index i
//   N        - number of columns of B and C
//   K        - length of the dot product (columns of A, rows of B)
//   start_m  - first row (inclusive) computed by this launch
//   end_m    - end of the row range (exclusive) computed by this launch
//   dev      - not read by the kernel; kept so the host-side argument list is
//              unchanged (NOTE(review): presumably a device index - confirm)
__kernel void sgemm(__global const float *A, __global const float *B, __global float *C, int M, int N, int K, int start_m, int end_m, int dev) {
  const int i = get_global_id(0);  // global row index
  const int j = get_global_id(1);  // column index

  // Work-items that fall outside the matrix have nothing to compute.
  if (i >= M || j >= N) {
    return;
  }
  // Rows outside [start_m, end_m) are not handled by this launch.
  if (i < start_m || i >= end_m) {
    return;
  }

  // Do the index arithmetic in size_t: products such as i * K overflow a
  // 32-bit int once the matrices grow past 2^31 elements.
  const size_t a_row = (size_t)i * (size_t)K;
  const size_t n_cols = (size_t)N;
  const size_t col = (size_t)j;

  float acc = 0.0f;
  for (int k = 0; k < K; k++) {
    acc += A[a_row + (size_t)k] * B[(size_t)k * n_cols + col];
  }

  // C is indexed relative to start_m, unlike A which uses the global row.
  C[(size_t)(i - start_m) * n_cols + col] = acc;
}