#include <cstdio>
#include <cstdlib>
#include "mat_mul.h"

// Abort with a diagnostic if a CUDA runtime call fails.
// do/while(0) wrapper makes the macro safe inside if/else without braces.
#define CUDA_CALL(f)                                                          \
  do {                                                                        \
    cudaError_t err = (f);                                                    \
    if (err != cudaSuccess) {                                                 \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,    \
              err, cudaGetErrorString(err));                                  \
      exit(1);                                                                \
    }                                                                         \
  } while (0)

#define TILE_WIDTH 16

// Tiled SGEMM kernel: C = A * B with A (M x K), B (K x N), C (M x N),
// all row-major. Expected launch: blockDim = (TILE_WIDTH, TILE_WIDTH, 1),
// gridDim = (ceil(N/TILE_WIDTH), ceil(M/TILE_WIDTH), 1). Sizes that are
// not multiples of TILE_WIDTH are handled by zero-padding the tiles.
__global__ void sgemm(float *A, float *B, float *C, int M, int N, int K) {
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int bx = blockIdx.x;
  int by = blockIdx.y;

  __shared__ float Ashared[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Bshared[TILE_WIDTH][TILE_WIDTH];

  int Row = by * TILE_WIDTH + ty;  // row of C this thread computes
  int Col = bx * TILE_WIDTH + tx;  // column of C this thread computes
  float value = 0;

  // Sweep tiles along the K dimension. Each iteration stages one tile of A
  // and one tile of B into shared memory, writing 0 for out-of-range
  // elements so the inner product below needs no extra bounds checks.
  for (int tk = 0; tk < (K + TILE_WIDTH - 1) / TILE_WIDTH; ++tk) {
    if ((Row < M) && (tk * TILE_WIDTH + tx < K))
      Ashared[ty][tx] = A[Row * K + tk * TILE_WIDTH + tx];
    else
      Ashared[ty][tx] = 0;

    if ((Col < N) && (tk * TILE_WIDTH + ty < K))
      Bshared[ty][tx] = B[(tk * TILE_WIDTH + ty) * N + Col];
    else
      Bshared[ty][tx] = 0;

    // Barrier is outside any divergent branch: every thread reaches it.
    __syncthreads();  // tiles fully staged before any thread reads them

    for (int k = 0; k < TILE_WIDTH; ++k) {
      value += Ashared[ty][k] * Bshared[k][tx];
    }

    __syncthreads();  // all reads done before the next iteration overwrites
  }

  if ((Row < M) && (Col < N))
    C[Row * N + Col] = value;
}

// Device (GPU) pointers, set up by mat_mul_init().
static float *a_d;
static float *b_d;
static float *c_d;

// Runs C = A * B on the GPU using the device buffers prepared by
// mat_mul_init(). The host pointers _A/_B/_C are part of the fixed
// interface but are not used here; data already lives in a_d/b_d/c_d.
void mat_mul(float *_A, float *_B, float *_C, int M, int N, int K) {
  dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
  dim3 gridDim((N + TILE_WIDTH - 1) / TILE_WIDTH,
               (M + TILE_WIDTH - 1) / TILE_WIDTH, 1);

  // BUG FIX: the execution configuration was missing ("sgemm<<>>(...)"),
  // which does not compile; restore <<<gridDim, blockDim>>>.
  sgemm<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K);
  CUDA_CALL(cudaGetLastError());  // surface invalid launch configurations

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}

// Allocates the device buffers and uploads A (M x K) and B (K x N).
// C is not uploaded: the kernel overwrites every in-range element of c_d.
void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {
  // Allocate device memory
  CUDA_CALL(cudaMalloc(&a_d, M * K * sizeof(float)));
  CUDA_CALL(cudaMalloc(&b_d, K * N * sizeof(float)));
  CUDA_CALL(cudaMalloc(&c_d, M * N * sizeof(float)));

  // Upload A and B matrix to GPU
  CUDA_CALL(cudaMemcpy(a_d, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
  CUDA_CALL(cudaMemcpy(b_d, B, K * N * sizeof(float), cudaMemcpyHostToDevice));

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}

// Downloads the M x N result into C and releases the device buffers.
void mat_mul_final(float *A, float *B, float *C, int M, int N, int K) {
  // Download C matrix from GPU (cudaMemcpy is blocking, so the result is
  // complete once this returns).
  CUDA_CALL(cudaMemcpy(C, c_d, M * N * sizeof(float), cudaMemcpyDeviceToHost));

  // FIX: free device memory so repeated init/final cycles do not leak.
  CUDA_CALL(cudaFree(a_d));
  CUDA_CALL(cudaFree(b_d));
  CUDA_CALL(cudaFree(c_d));
  a_d = b_d = c_d = nullptr;

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}