// Super slow (naive, unoptimized) sgemm kernel by heehoon
/*
 * Naive single-precision matrix multiply: each work-item computes one
 * element of the product A * B with a straight dot-product loop.
 *
 * Work-item (i, j) = (get_global_id(0), get_global_id(1)) handles row i and
 * column j. Only rows in [start_m, end_m) are computed, and the result for
 * row i is stored in row (i - start_m) of C, so C holds just that row slice
 * of the product.
 *
 * A        read as a row-major matrix with K columns, indexed by the global
 *          row i (not by the slice-relative row)
 * B        read as a row-major matrix with N columns (K rows are read)
 * C        written as a row-major matrix with N columns, rows relative to
 *          start_m
 * M, N, K  matrix dimensions; work-items with i >= M or j >= N do nothing
 * start_m  first row (inclusive) handled by this launch
 * end_m    end row (exclusive) handled by this launch
 * dev      not read by the kernel; presumably a device index used only for
 *          debug output -- kept so the caller's argument list still matches
 */
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K, int start_m, int end_m, int dev) {
    int i = get_global_id(0); // row index into A (global row of the product)
    int j = get_global_id(1); // column index into B and C

    // Work-items outside the matrix have nothing to do.
    if (i >= M || j >= N) {
        return;
    }

    // Work-items outside this launch's row slice have nothing to do either.
    if (i < start_m || i >= end_m) {
        return;
    }

    // Index arithmetic is done in size_t so that row * width cannot overflow
    // a signed int when the matrices hold more than 2^31 elements.
    const size_t width = (size_t)N;
    const size_t a_row = (size_t)i * (size_t)K; // loop-invariant offset of row i in A

    float acc = 0.0f;
    for (int k = 0; k < K; k++) {
        acc += A[a_row + (size_t)k] * B[(size_t)k * width + (size_t)j];
    }

    // C is addressed relative to start_m: it only stores rows [start_m, end_m).
    C[(size_t)(i - start_m) * width + (size_t)j] = acc;
}