// Naive CUDA matrix multiplication: device kernel plus host-side
// initialize / run / finalize wrappers.
#include <climits>
#include <cstdio>
#include <cstdlib>

#include "matmul.h"
|
|
|
|
// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the call site (file and line) together with the
// runtime's error string to stderr and terminates the process with
// EXIT_FAILURE.
//
// The argument is parenthesized so that any expression passed in is evaluated
// as a single unit, and the do { ... } while (0) wrapper makes the expansion
// behave as one statement (safe inside an unbraced if/else).
#define CHECK_CUDA(call)                                                \
  do {                                                                  \
    cudaError_t status_ = (call);                                       \
    if (status_ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(status_));                             \
      exit(EXIT_FAILURE);                                               \
    }                                                                   \
  } while (0)
|
|
|
|
// Computes C = A * B for row-major matrices stored in flat arrays:
//   A is M x K (element (r, k) at A[r * K + k]),
//   B is K x N (element (k, c) at B[k * N + c]),
//   C is M x N (element (r, c) at C[r * N + c]).
//
// Launch layout: any 2D grid of 2D blocks. The x dimension walks columns of C
// and the y dimension walks rows of C, both with grid-stride loops, so every
// element is produced regardless of how many blocks were launched in each
// dimension. Mapping threadIdx.x to the column index means consecutive
// threads in x touch consecutive addresses of B and C.
//
// No shared memory is used. A, B and C must not alias (they are declared
// __restrict__).
static __global__ void matmul_kernel(const float *__restrict__ A,
                                     const float *__restrict__ B,
                                     float *__restrict__ C, int M, int N,
                                     int K) {
  // Nothing to write for an empty (or nonsensical negative) output shape.
  if (M <= 0 || N <= 0) return;

  // All index arithmetic is done in size_t so that row * K + k and friends
  // cannot overflow a 32-bit int for large matrices.
  const size_t rows = static_cast<size_t>(M);
  const size_t cols = static_cast<size_t>(N);
  const size_t row_step = static_cast<size_t>(blockDim.y) * gridDim.y;
  const size_t col_step = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t row = static_cast<size_t>(blockIdx.y) * blockDim.y + threadIdx.y;
       row < rows; row += row_step) {
    for (size_t col =
             static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         col < cols; col += col_step) {
      float sum = 0.0f;  // float literal: avoids a double-precision temporary
      // If K <= 0 the loop body never runs and the element is written as 0.
      for (int k = 0; k < K; ++k) {
        sum += A[row * K + k] * B[static_cast<size_t>(k) * cols + col];
      }
      C[row * cols + col] = sum;
    }
  }
}
|
|
|
|
// Device-side buffers shared by the initialize / run / finalize functions in
// this file. Being file-scope statics they start out as null pointers.
static float *A_gpu;  // left operand, allocated as M * K floats
static float *B_gpu;  // right operand, allocated as K * N floats
static float *C_gpu;  // result, allocated as M * N floats
|
|
|
|
// Allocates the three device buffers used by matmul_naive:
//   A_gpu holds M * K floats, B_gpu holds K * N floats, C_gpu holds M * N
//   floats.
// Any allocation failure is reported by CHECK_CUDA, which terminates the
// process.
// NOTE(review): pointers from a previous call are overwritten without being
// freed, so this looks intended to be called once per finalize - confirm
// against callers.
void matmul_naive_initialize(size_t M, size_t N, size_t K) {
  const size_t a_bytes = M * K * sizeof(float);
  const size_t b_bytes = K * N * sizeof(float);
  const size_t c_bytes = M * N * sizeof(float);

  CHECK_CUDA(cudaMalloc(&A_gpu, a_bytes));
  CHECK_CUDA(cudaMalloc(&B_gpu, b_bytes));
  CHECK_CUDA(cudaMalloc(&C_gpu, c_bytes));
}
|
|
|
|
// Computes C = A * B on the GPU for row-major host matrices.
//
//   A: host pointer to M * K floats (copied to the device buffer A_gpu)
//   B: host pointer to K * N floats (copied to the device buffer B_gpu)
//   C: host pointer receiving M * N floats (copied back from C_gpu)
//
// The device buffers are the file-scope ones allocated by
// matmul_naive_initialize, so the dimensions passed here must not exceed the
// ones used for that allocation.
//
// Any CUDA failure is reported by CHECK_CUDA, which terminates the process.
// Dimensions larger than INT_MAX are rejected the same way because the kernel
// takes its dimensions as int.
void matmul_naive(float *A, float *B, float *C, size_t M, size_t N, size_t K) {
  // An empty result has nothing to compute, and a grid with a zero dimension
  // would be rejected at launch as an invalid configuration.
  if (M == 0 || N == 0) return;

  // The kernel's dimension parameters are int; refuse values that would be
  // silently truncated instead of producing a wrong result.
  const size_t max_dim = static_cast<size_t>(INT_MAX);
  if (M > max_dim || N > max_dim || K > max_dim) {
    fprintf(stderr,
            "matmul_naive: dimensions exceed INT_MAX (M=%zu, N=%zu, K=%zu)\n",
            M, N, K);
    exit(EXIT_FAILURE);
  }

  CHECK_CUDA(
      cudaMemcpy(A_gpu, A, M * K * sizeof(float), cudaMemcpyHostToDevice));
  CHECK_CUDA(
      cudaMemcpy(B_gpu, B, K * N * sizeof(float), cudaMemcpyHostToDevice));

  // Square thread tiles; the grid is the ceiling division of each extent by
  // the tile edge (x spans M, y spans N).
  const unsigned int tile = 32;
  const dim3 block(tile, tile);
  const dim3 grid(static_cast<unsigned int>((M + tile - 1) / tile),
                  static_cast<unsigned int>((N + tile - 1) / tile));

  matmul_kernel<<<grid, block>>>(A_gpu, B_gpu, C_gpu, static_cast<int>(M),
                                 static_cast<int>(N), static_cast<int>(K));
  // A kernel launch returns no status; launch-configuration errors are picked
  // up here.
  CHECK_CUDA(cudaGetLastError());

  // Blocking copy: it waits for the kernel to finish, so errors raised while
  // the kernel ran are reported by this call.
  CHECK_CUDA(
      cudaMemcpy(C, C_gpu, M * N * sizeof(float), cudaMemcpyDeviceToHost));
}
|
|
|
|
// Releases the device buffers A_gpu, B_gpu and C_gpu.
//
// The dimension parameters are not needed to free the buffers and are
// ignored. Each pointer is reset to nullptr after it is freed so that a
// repeated call is harmless (cudaFree on a null pointer performs no
// operation) and no stale device pointer is left behind.
// Any failure is reported by CHECK_CUDA, which terminates the process.
void matmul_naive_finalize(size_t M, size_t N, size_t K) {
  (void)M;
  (void)N;
  (void)K;

  CHECK_CUDA(cudaFree(A_gpu));
  A_gpu = nullptr;
  CHECK_CUDA(cudaFree(B_gpu));
  B_gpu = nullptr;
  CHECK_CUDA(cudaFree(C_gpu));
  C_gpu = nullptr;
}