chundoong-lab-ta/SamsungDS23/HW5-ans/mat_mul.cu

92 lines
2.8 KiB
Plaintext

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#include "mat_mul.h"
// Evaluates the CUDA runtime call `f` exactly once. If the result is anything
// other than cudaSuccess, prints the source file, line, numeric error code and
// error string to stderr and terminates the process with exit status 1.
//
// The body is wrapped in do { ... } while (0) so that `CUDA_CALL(x);` expands
// to a single statement and remains safe inside an unbraced if/else.
#define CUDA_CALL(f) \
  do { \
    cudaError_t err_ = (f); \
    if (err_ != cudaSuccess) { \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              err_, cudaGetErrorString(err_)); \
      exit(1); \
    } \
  } while (0)
// Edge length of the square tiles held in shared memory by the sgemm kernel.
// mat_mul launches thread blocks of TILE_WIDTH x TILE_WIDTH threads, so the
// kernel's tile indexing and the launch configuration both depend on it.
#define TILE_WIDTH 16
// Tiled single-precision matrix multiply: C = A * B.
//
// Indexing used by the kernel (row-major):
//   A[row * K + k], B[k * N + col], C[row * N + col]
// i.e. A is M x K, B is K x N and C is M x N.
//
// Launch layout: 2D grid of 2D blocks. blockIdx.x/threadIdx.x select the
// column of C and blockIdx.y/threadIdx.y select the row. The tile indexing
// assumes blockDim == (TILE_WIDTH, TILE_WIDTH). Each block uses
// 2 * TILE_WIDTH * TILE_WIDTH floats of static shared memory.
//
// Threads that fall outside the matrices still take part in the tile loads
// (writing zeros) and in every __syncthreads(), so the barriers are reached by
// the whole block; only the final store to C is guarded.
__global__ void sgemm(float *A, float *B, float *C, int M, int N, int K) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int bx = blockIdx.x;
  const int by = blockIdx.y;

  __shared__ float Ashared[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Bshared[TILE_WIDTH][TILE_WIDTH];

  // Element of C this thread is responsible for.
  const int Row = by * TILE_WIDTH + ty;
  const int Col = bx * TILE_WIDTH + tx;

  // Flat offsets are computed in size_t: with int arithmetic, Row * K or
  // Row * N overflows once a matrix holds more than INT_MAX elements.
  const size_t aRowBase = (size_t)Row * K;

  float value = 0.0f;
  const int numTiles = (K + TILE_WIDTH - 1) / TILE_WIDTH;
  for (int tk = 0; tk < numTiles; ++tk) {
    const int aCol = tk * TILE_WIDTH + tx;  // column of A loaded by this thread
    const int bRow = tk * TILE_WIDTH + ty;  // row of B loaded by this thread

    // Out-of-range positions are zero-filled so they add nothing to the sum.
    if ((Row < M) && (aCol < K)) Ashared[ty][tx] = A[aRowBase + aCol];
    else Ashared[ty][tx] = 0.0f;
    if ((Col < N) && (bRow < K)) Bshared[ty][tx] = B[(size_t)bRow * N + Col];
    else Bshared[ty][tx] = 0.0f;

    // Both tiles must be fully written before any thread reads them.
    __syncthreads();

    for (int k = 0; k < TILE_WIDTH; ++k) {
      value += Ashared[ty][k] * Bshared[k][tx];
    }

    // Keep the tiles intact until every thread has finished reading them.
    __syncthreads();
  }

  if ((Row < M) && (Col < N))
    C[(size_t)Row * N + Col] = value;
}
// Device (GPU) pointers
// File-scope buffers shared by mat_mul_init, mat_mul and mat_mul_final.
// All three are allocated with cudaMalloc in mat_mul_init.
static float *a_d;  // M * K floats; filled from host A in mat_mul_init
static float *b_d;  // K * N floats; filled from host B in mat_mul_init
static float *c_d;  // M * N floats; written by sgemm, copied to host C in mat_mul_final
// Computes C = A * B on the GPU using the file-scope device buffers
// (a_d, b_d, c_d). The host pointers _A, _B and _C are not touched here.
//
// M, N, K: matrix dimensions passed straight to the sgemm kernel.
//
// Blocks until the kernel has finished. Any launch or execution error is
// reported through CUDA_CALL, which terminates the process.
void mat_mul(float *_A, float *_B, float *_C, int M, int N, int K) {
  // A grid with a zero-sized dimension is an invalid launch configuration, so
  // skip the launch entirely when C has no elements.
  if (M > 0 && N > 0) {
    // One thread per element of C, grouped into TILE_WIDTH x TILE_WIDTH blocks;
    // the grid is rounded up so partial tiles at the edges are covered.
    const dim3 threadsPerBlock(TILE_WIDTH, TILE_WIDTH, 1);
    const dim3 numBlocks((N + TILE_WIDTH - 1) / TILE_WIDTH,
                         (M + TILE_WIDTH - 1) / TILE_WIDTH, 1);
    sgemm<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d, M, N, K);

    // A kernel launch returns no status; launch-configuration errors (for
    // example a grid dimension above the device limit) are only visible here.
    CUDA_CALL(cudaGetLastError());
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL( cudaDeviceSynchronize() );
}
// Allocates the device buffers for A, B and C and uploads A and B.
//
// A: host matrix of M * K floats, copied to a_d.
// B: host matrix of K * N floats, copied to b_d.
// C: not accessed here; only device storage for M * N floats is allocated.
//
// Any CUDA failure is reported through CUDA_CALL, which terminates the
// process.
void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {
  // Widen before multiplying: M * K evaluated in int overflows for matrices
  // with more than INT_MAX elements, before sizeof() can promote it.
  const size_t bytesA = (size_t)M * K * sizeof(float);
  const size_t bytesB = (size_t)K * N * sizeof(float);
  const size_t bytesC = (size_t)M * N * sizeof(float);

  // Release buffers left over from an earlier call so that re-initialising
  // does not leak device memory. cudaFree on a null pointer is a no-op, which
  // covers the first call (the statics start out as null).
  CUDA_CALL(cudaFree(a_d));
  CUDA_CALL(cudaFree(b_d));
  CUDA_CALL(cudaFree(c_d));
  a_d = nullptr;
  b_d = nullptr;
  c_d = nullptr;

  // Allocate device memory
  CUDA_CALL(cudaMalloc(&a_d, bytesA));
  CUDA_CALL(cudaMalloc(&b_d, bytesB));
  CUDA_CALL(cudaMalloc(&c_d, bytesC));

  // Upload A and B matrix to GPU
  CUDA_CALL(cudaMemcpy(a_d, A, bytesA, cudaMemcpyHostToDevice));
  CUDA_CALL(cudaMemcpy(b_d, B, bytesB, cudaMemcpyHostToDevice));

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}
// Downloads the result matrix from the GPU and releases the device buffers.
//
// C: host destination for M * N floats copied from c_d.
// A, B, K: not used here.
//
// After this call a_d, b_d and c_d are freed and reset to null, so
// mat_mul_init must be called again before the next mat_mul.
// NOTE(review): assumes the caller invokes this once, after the last mat_mul
// of a run - confirm against the driver program.
void mat_mul_final(float *A, float *B, float *C, int M, int N, int K) {
  // Widen before multiplying so the byte count cannot overflow int.
  const size_t bytesC = (size_t)M * N * sizeof(float);

  // Download C matrix from GPU
  CUDA_CALL(cudaMemcpy(C, c_d, bytesC, cudaMemcpyDeviceToHost));

  // Post-matmul cleanup: free the device allocations and clear the pointers
  // so a stale address is never reused or freed twice.
  CUDA_CALL(cudaFree(a_d));
  CUDA_CALL(cudaFree(b_d));
  CUDA_CALL(cudaFree(c_d));
  a_d = nullptr;
  b_d = nullptr;
  c_d = nullptr;

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}