chundoong-lab-ta/SHPC2022/hw6_answer/matmul/matmul.cu

190 lines
5.8 KiB
Plaintext

#include "matmul.h"
#include "util.h"
#include <cuda_runtime.h>
#include <mpi.h>
#define CUDA_CALL(f) \
{ \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
err, cudaGetErrorString(err)); \
exit(1); \
} \
}
#define MAX_NUM_GPU 4
int num_devices = 0;
static int mpi_rank, mpi_world_size;
static int Asendcounts[4];
static int Adispls[4];
static int Crecvcounts[4];
static int Cdispls[4];
// Array of device (GPU) pointers
static float *a_d[MAX_NUM_GPU];
static float *b_d[MAX_NUM_GPU];
static float *c_d[MAX_NUM_GPU];
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];
cudaStream_t streams[MAX_NUM_GPU];
#define BLOCK_SIZE 32
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
__global__ void matmul_kernel(float *A, float *B, float *C, int M, int N, int K) {
int j = blockIdx.x * blockDim.x + threadIdx.x;
int i = blockIdx.y * blockDim.y + threadIdx.y;
int gj = blockIdx.x;
int gi = blockIdx.y;
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
int lj = threadIdx.x;
int li = threadIdx.y;
__shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];
float c = 0.f;
int A_row_index = (gi * BLOCK_SIZE + li);
int B_col_index = (gj * BLOCK_SIZE + lj);
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
int A_col_index = bk + lj;
Alocal[li][lj] = (A_row_index < M && A_col_index < K) ?
A[A_row_index * K + A_col_index] :
0.f;
int B_row_index = bk + li;
Blocal[li][lj] = (B_row_index < K && B_col_index < N) ?
B[B_row_index * N + B_col_index] :
0.f;
__syncthreads();
for (int lk = 0; lk < BLOCK_SIZE; ++lk) {
c += Alocal[li][lk] * Blocal[lk][lj];
}
__syncthreads();
}
if (i < M && j < N)
C[i * N + j] = c;
}
void matmul(const float *A, const float *B, float *C, int M, int N, int K) {
MPI_Scatterv(A, Asendcounts, Adispls,
MPI_FLOAT, (void*)A, Asendcounts[mpi_rank], MPI_FLOAT,
0, MPI_COMM_WORLD);
MPI_Bcast((void*)B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
// Upload A and B matrix to every GPU
#pragma omp parallel for
for (int i = 0; i < num_devices; i++) {
CUDA_CALL(cudaSetDevice(i));
CUDA_CALL(cudaMemcpyAsync(
a_d[i], A + Mbegin[i] * K,
(Mend[i] - Mbegin[i]) * K * sizeof(float),
cudaMemcpyHostToDevice,
streams[i]));
CUDA_CALL(cudaMemcpyAsync(b_d[i], B, K * N * sizeof(float),
cudaMemcpyHostToDevice, streams[i]));
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (Mend[i] - Mbegin[i] + BLOCK_SIZE-1) / BLOCK_SIZE);
matmul_kernel<<<gridDim, blockDim, 0, streams[i]>>>(a_d[i], b_d[i], c_d[i], Mend[i] - Mbegin[i], N, K);
CUDA_CALL(cudaMemcpyAsync(C + Mbegin[i] * N, c_d[i],
(Mend[i] - Mbegin[i]) * N * sizeof(float),
cudaMemcpyDeviceToHost, streams[i]));
}
#pragma omp parallel for
for (int i = 0; i < num_devices; i++) {
CUDA_CALL(cudaSetDevice(i));
CUDA_CALL(cudaStreamSynchronize(streams[i]));
}
MPI_Gatherv(C, Crecvcounts[mpi_rank], MPI_FLOAT,
C, Crecvcounts, Cdispls, MPI_FLOAT,
0, MPI_COMM_WORLD);
}
void matmul_initialize(int M, int N, int K) {
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
for (int i = 0; i < mpi_world_size; i++) {
Adispls[i] = ((M / mpi_world_size) * K) * i;
Asendcounts[i] = ((M / mpi_world_size) * K);
Cdispls[i] = ((M / mpi_world_size) * N) * i;
Crecvcounts[i] = ((M / mpi_world_size) * N);
}
Asendcounts[mpi_world_size - 1] = M*K - Adispls[mpi_world_size-1];
Crecvcounts[mpi_world_size - 1] = M*N - Cdispls[mpi_world_size-1];
// Only root process do something
CUDA_CALL(cudaGetDeviceCount(&num_devices));
int num_global_devices = 0;
MPI_Reduce(&num_devices, (void*)&num_global_devices, 1, MPI_INT,
MPI_SUM, 0, MPI_COMM_WORLD);
if (mpi_rank == 0) {
printf("Using %d devices\n", num_devices);
}
MPI_Barrier(MPI_COMM_WORLD);
for (int j = 0; j < mpi_world_size; ++j) {
if (mpi_rank == j) {
for (int i = 0; i < num_devices; i++) {
cudaDeviceProp prop;
CUDA_CALL(cudaGetDeviceProperties(&prop, i));
// Try printing more detailed information here
printf("[rank %d] GPU %d: %s\n", mpi_rank, i, prop.name);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
if (num_devices <= 0) {
printf("[rank %d] No CUDA device found. Aborting\n", mpi_rank);
exit(1);
}
// Setup problem size for each GPU
for (int i = 0; i < num_devices; i++) {
Mbegin[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * i;
Mend[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * (i + 1);
}
Mend[num_devices - 1] = (Asendcounts[mpi_rank] / K);
// Allocate device memory for each GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL(cudaSetDevice(i));
CUDA_CALL(cudaMalloc(&a_d[i], (Mend[i] - Mbegin[i]) * K * sizeof(float)));
CUDA_CALL(cudaMalloc(&b_d[i], K * N * sizeof(float)));
CUDA_CALL(cudaMalloc(&c_d[i], (Mend[i] - Mbegin[i]) * N * sizeof(float)));
CUDA_CALL(cudaStreamCreate(&streams[i]));
}
}
void matmul_finalize() {
for (int i = 0; i < num_devices; i++) {
CUDA_CALL(cudaFree(a_d[i]));
CUDA_CALL(cudaFree(b_d[i]));
CUDA_CALL(cudaFree(c_d[i]));
CUDA_CALL(cudaStreamDestroy(streams[i]));
}
}