// Tiled matrix multiplication (SGEMM) on a single GPU: C = A * B.
#include <cuda_runtime.h>
|
|
|
|
#include <cstdio>
|
|
|
|
#include "mat_mul.h"
|
|
|
|
// CUDA_CALL(f): evaluate a CUDA runtime call and abort with a diagnostic on
// failure. Wrapped in do { ... } while (0) so the macro expands to a single
// statement — the original bare-brace form breaks under
// `if (x) CUDA_CALL(f); else ...` (dangling semicolon / else mis-parse).
#define CUDA_CALL(f)                                                        \
  do {                                                                      \
    cudaError_t err = (f);                                                  \
    if (err != cudaSuccess) {                                               \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,  \
              err, cudaGetErrorString(err));                                \
      exit(1);                                                              \
    }                                                                       \
  } while (0)

// Square tile edge for the shared-memory SGEMM kernel; the launch in
// mat_mul uses blockDim = (TILE_WIDTH, TILE_WIDTH, 1).
#define TILE_WIDTH 16
|
|
|
|
// Tiled SGEMM kernel: C = A * B with A (M x K), B (K x N), C (M x N), all
// row-major. Expects blockDim = (TILE_WIDTH, TILE_WIDTH, 1) and gridDim
// covering ceil(N/TILE_WIDTH) x ceil(M/TILE_WIDTH) output tiles.
// A and B are read-only here; const + __restrict__ lets the compiler route
// their loads through the read-only data cache and reorder them.
__global__ void sgemm(const float *__restrict__ A,
                      const float *__restrict__ B,
                      float *__restrict__ C, int M, int N, int K) {
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int bx = blockIdx.x;
  int by = blockIdx.y;

  // Per-block staging tiles for one TILE_WIDTH-wide slab of A and B.
  __shared__ float Ashared[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Bshared[TILE_WIDTH][TILE_WIDTH];

  int Row = by * TILE_WIDTH + ty;  // row of C this thread produces
  int Col = bx * TILE_WIDTH + tx;  // column of C this thread produces

  float value = 0.0f;

  // Sweep the K dimension one tile at a time; ceil-div handles K not being
  // a multiple of TILE_WIDTH.
  for (int tk = 0; tk < (K + TILE_WIDTH - 1) / TILE_WIDTH; ++tk) {
    // Cooperative tile load. Out-of-range elements are zero-filled so the
    // inner product below needs no per-element bounds checks.
    if ((Row < M) && (tk * TILE_WIDTH + tx < K))
      Ashared[ty][tx] = A[Row * K + tk * TILE_WIDTH + tx];
    else
      Ashared[ty][tx] = 0.0f;

    if ((Col < N) && (tk * TILE_WIDTH + ty < K))
      Bshared[ty][tx] = B[(tk * TILE_WIDTH + ty) * N + Col];
    else
      Bshared[ty][tx] = 0.0f;

    // Every thread must finish writing the tiles before anyone reads them.
    // (Reached by all threads: the loop bound is uniform across the block.)
    __syncthreads();

#pragma unroll
    for (int k = 0; k < TILE_WIDTH; ++k) {
      value += Ashared[ty][k] * Bshared[k][tx];
    }
    // Keep the next iteration from overwriting tiles still being read.
    __syncthreads();
  }

  if ((Row < M) && (Col < N))
    C[Row * N + Col] = value;
}
|
|
|
|
// Module-scope device (GPU) buffers shared across the three entry points:
// a_d holds A (M x K), b_d holds B (K x N), c_d holds C (M x N).
// Allocated and filled in mat_mul_init, consumed by mat_mul, and the result
// is downloaded in mat_mul_final.
static float *a_d;
static float *b_d;
static float *c_d;
|
|
|
|
// Compute C = A * B on the GPU using the device buffers uploaded by
// mat_mul_init. The host pointers _A/_B/_C are part of the fixed harness
// interface and are unused here; the result stays in c_d until
// mat_mul_final downloads it.
void mat_mul(float *_A, float *_B, float *_C, int M, int N, int K) {
  // One thread per element of a TILE_WIDTH x TILE_WIDTH tile of C;
  // ceil-div so partial tiles on the right/bottom edges are still covered.
  dim3 blockDim(TILE_WIDTH, TILE_WIDTH, 1);
  dim3 gridDim((N + TILE_WIDTH - 1) / TILE_WIDTH,
               (M + TILE_WIDTH - 1) / TILE_WIDTH, 1);

  sgemm<<<gridDim, blockDim>>>(a_d, b_d, c_d, M, N, K);

  // Kernel launches don't return a status directly; fetch launch-config
  // errors (bad grid/block dims, etc.) immediately instead of letting them
  // surface as a mystery failure at the next unrelated API call.
  CUDA_CALL(cudaGetLastError());

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}
|
|
|
|
// Allocate the device buffers for A, B, and C, and upload A and B.
// Byte counts are computed in size_t: the original `M * K * sizeof(float)`
// evaluated M*K in int first, which silently overflows for large matrices
// (e.g. M = K = 50000).
void mat_mul_init(float *A, float *B, float *C, int M, int N, int K) {
  size_t bytesA = (size_t)M * K * sizeof(float);
  size_t bytesB = (size_t)K * N * sizeof(float);
  size_t bytesC = (size_t)M * N * sizeof(float);

  // Allocate device memory
  CUDA_CALL(cudaMalloc(&a_d, bytesA));
  CUDA_CALL(cudaMalloc(&b_d, bytesB));
  CUDA_CALL(cudaMalloc(&c_d, bytesC));

  // Upload A and B matrix to GPU
  CUDA_CALL(cudaMemcpy(a_d, A, bytesA, cudaMemcpyHostToDevice));
  CUDA_CALL(cudaMemcpy(b_d, B, bytesB, cudaMemcpyHostToDevice));

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}
|
|
|
|
// Download the result matrix C and release the device buffers allocated in
// mat_mul_init. A/B/M/K are part of the fixed harness interface; only C, M,
// and N are used here.
void mat_mul_final(float *A, float *B, float *C, int M, int N, int K) {
  // Download C matrix from GPU (blocking copy, so C is valid on return).
  // size_t arithmetic avoids int overflow of M*N for large matrices.
  CUDA_CALL(cudaMemcpy(C, c_d, (size_t)M * N * sizeof(float),
                       cudaMemcpyDeviceToHost));

  // Fix: the original leaked all three device allocations. This is the
  // final stage, so free them here and clear the stale pointers.
  CUDA_CALL(cudaFree(a_d));
  CUDA_CALL(cudaFree(b_d));
  CUDA_CALL(cudaFree(c_d));
  a_d = b_d = c_d = NULL;

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  CUDA_CALL(cudaDeviceSynchronize());
}
|