/* Edge length of the square tiles of A and B staged in local memory. */
#define row_col_size_local 64
/* Number of C elements (rows within the tile) accumulated by each work-item. */
#define rows_per_local 32

/*
 * sgemm: single-precision matrix multiply C = A * B using local-memory tiling,
 * with each work-item producing rows_per_local elements of C.
 *
 * A is read as M x K, B as K x N and C is written as M x N, all row-major
 * (index = row * row_length + col).
 *
 * Each work-group computes one row_col_size_local x row_col_size_local tile
 * of C. A work-item handles column `global_col` and the rows
 * `global_row + w * RTS` for w in [0, rows_per_local).
 *
 * NOTE(review): the indexing implies a local work size of
 * (RTS, row_col_size_local) = (2, 64) -- confirm against the host code.
 *
 * M, N and K need not be multiples of the tile size: out-of-range tile
 * elements are loaded as 0.0f and out-of-range C elements are not written.
 */
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {
  const int local_row = get_local_id(0);  /* row ID inside the work-group */
  const int local_col = get_local_id(1);  /* column ID inside the work-group */
  const int global_row = row_col_size_local * get_group_id(0) + local_row;  /* first row of C for this work-item */
  const int global_col = row_col_size_local * get_group_id(1) + local_col;  /* column of C for this work-item */

  /* Row stride between consecutive elements handled by the same work-item. */
  const int RTS = row_col_size_local / rows_per_local;

  __local float Asub[row_col_size_local][row_col_size_local];
  __local float Bsub[row_col_size_local][row_col_size_local];

  /* One private accumulator per output row owned by this work-item. */
  float temp_val[rows_per_local];
  for (int w = 0; w < rows_per_local; w++) {
    temp_val[w] = 0.0f;
  }

  /* Round up so that a trailing partial tile along K is still processed. */
  const int num_tiles = (K + row_col_size_local - 1) / row_col_size_local;

  for (int t = 0; t < num_tiles; t++) {
    const int t_row = row_col_size_local * t + local_row;  /* row of B (k index) loaded by this work-item */
    const int t_col = row_col_size_local * t + local_col;  /* column of A (k index) loaded by this work-item */

    /* Cooperative load: each work-item fills rows_per_local rows of one
     * column in both tiles, zero-padding anything outside the matrices. */
    for (int w = 0; w < rows_per_local; w++) {
      if (global_row + (w * RTS) >= M || t_col >= K) {
        Asub[local_row + w * RTS][local_col] = 0.0f;
      } else {
        Asub[local_row + w * RTS][local_col] = A[(global_row + w * RTS) * K + t_col];
      }

      if (t_row + (w * RTS) >= K || global_col >= N) {
        Bsub[local_row + w * RTS][local_col] = 0.0f;
      } else {
        Bsub[local_row + w * RTS][local_col] = B[(t_row + w * RTS) * N + global_col];
      }
    }

    /* Both tiles must be completely written before any work-item reads them. */
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int k = 0; k < row_col_size_local; k++) {
      const float b_val = Bsub[k][local_col];
      for (int w = 0; w < rows_per_local; w++) {
        temp_val[w] += Asub[local_row + w * RTS][k] * b_val;
      }
    }

    /* Nobody may overwrite the tiles for the next iteration while another
     * work-item is still reading the current ones. */
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  /* Write back only the elements that fall inside C. No early return is used
   * above so that every work-item reaches every barrier. */
  for (int w = 0; w < rows_per_local; w++) {
    if (global_row + w * RTS < M && global_col < N) {
      C[(global_row + w * RTS) * N + global_col] = temp_val[w];
    }
  }
}

/*
 * Reference implementation (disabled): naive, very slow kernel with one
 * work-item per element of C. Kept for comparison with the tiled kernel.
 *
__kernel void sgemm(__global float *A, __global float *B, __global float *C, int M, int N, int K) {
  int i = get_global_id(0); // row index of C
  int j = get_global_id(1); // column index of C
  if (i >= M || j >= N) return; // boundary check

  C[i * N + j] = 0;
  for (int k = 0; k < K; k++) {
    C[i * N + j] += A[i * K + k] * B[k * N + j];
  }
}
*/