chundoong-lab-ta/APWS23/ans/matmul_multigpu_ans.cu

105 lines
3.6 KiB
Plaintext

#include <cstdio>
#include "matmul.h"
#define CHECK_CUDA(call) \
do { \
cudaError_t status_ = call; \
if (status_ != cudaSuccess) { \
fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, \
cudaGetErrorString(status_)); \
exit(EXIT_FAILURE); \
} \
} while (0)
static __global__ void matmul_kernel(float *A, float *B, float *C, int M, int N,
int K) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= M || j >= N) return;
float sum = 0.0;
for (int k = 0; k < K; ++k) sum += A[i * K + k] * B[k * N + j];
C[i * N + j] = sum;
}
#define NGPU 4
#define EVENTS_PER_GPU 1 // Increase as needed
static size_t Mbegin[NGPU], Mend[NGPU];
static size_t ngpu;
static cudaStream_t streams[NGPU];
static cudaEvent_t events[NGPU][EVENTS_PER_GPU];
static float *A_gpu[NGPU], *B_gpu[NGPU], *C_gpu[NGPU];
void matmul_multigpu_initialize(size_t M, size_t N, size_t K) {
ngpu = 4;
for (size_t i = 0; i < ngpu; i++) {
Mbegin[i] = M / ngpu * i;
Mend[i] = M / ngpu * (i + 1);
if (i == ngpu - 1) Mend[i] = M;
}
for (size_t i = 0; i < ngpu; i++) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaStreamCreate(&streams[i]));
for (int j = 0; j < EVENTS_PER_GPU; j++) {
CHECK_CUDA(cudaEventCreate(&events[i][j]));
}
}
for (size_t i = 0; i < ngpu; i++) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(
cudaMalloc(&A_gpu[i], (Mend[i] - Mbegin[i]) * K * sizeof(float)));
CHECK_CUDA(cudaMalloc(&B_gpu[i], K * N * sizeof(float)));
CHECK_CUDA(
cudaMalloc(&C_gpu[i], (Mend[i] - Mbegin[i]) * N * sizeof(float)));
}
}
void matmul_multigpu(float *A, float *B, float *C, size_t M, size_t N,
size_t K) {
for (size_t i = 0; i < ngpu; i++) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMemcpyAsync(A_gpu[i], &A[Mbegin[i] * K],
(Mend[i] - Mbegin[i]) * K * sizeof(float),
cudaMemcpyHostToDevice, streams[i]));
CHECK_CUDA(cudaMemcpyAsync(B_gpu[i], B, K * N * sizeof(float),
cudaMemcpyHostToDevice, streams[i]));
}
for (size_t i = 0; i < ngpu; i++) {
CHECK_CUDA(cudaSetDevice(i));
dim3 blockDim(32, 32);
dim3 gridDim((Mend[i] - Mbegin[i] + 32 - 1) / 32, (N + 32 - 1) / 32);
matmul_kernel<<<gridDim, blockDim, 0, streams[i]>>>(
A_gpu[i], B_gpu[i], C_gpu[i], Mend[i] - Mbegin[i], N, K);
CHECK_CUDA(cudaGetLastError());
}
for (size_t i = 0; i < ngpu; i++) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaMemcpyAsync(&C[Mbegin[i] * N], C_gpu[i],
(Mend[i] - Mbegin[i]) * N * sizeof(float),
cudaMemcpyDeviceToHost, streams[i]));
}
for (size_t i = 0; i < ngpu; i++) {
cudaSetDevice(i);
cudaStreamSynchronize(streams[i]);
}
}
void matmul_multigpu_finalize(size_t M, size_t N, size_t K) {
for (size_t i = 0; i < ngpu; i++) {
CHECK_CUDA(cudaSetDevice(i));
CHECK_CUDA(cudaFree(A_gpu[i]));
CHECK_CUDA(cudaFree(B_gpu[i]));
CHECK_CUDA(cudaFree(C_gpu[i]));
CHECK_CUDA(cudaStreamDestroy(streams[i]));
for (int j = 0; j < EVENTS_PER_GPU; j++) {
CHECK_CUDA(cudaEventDestroy(events[i][j]));
}
}
}