190 lines
5.8 KiB
Plaintext
190 lines
5.8 KiB
Plaintext
#include "matmul.h"
|
|
#include "util.h"
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <mpi.h>
|
|
|
|
#define CUDA_CALL(f) \
|
|
{ \
|
|
cudaError_t err = (f); \
|
|
if (err != cudaSuccess) { \
|
|
fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
|
|
err, cudaGetErrorString(err)); \
|
|
exit(1); \
|
|
} \
|
|
}
|
|
|
|
#define MAX_NUM_GPU 4
|
|
int num_devices = 0;
|
|
|
|
static int mpi_rank, mpi_world_size;
|
|
static int Asendcounts[4];
|
|
static int Adispls[4];
|
|
static int Crecvcounts[4];
|
|
static int Cdispls[4];
|
|
|
|
// Array of device (GPU) pointers
|
|
static float *a_d[MAX_NUM_GPU];
|
|
static float *b_d[MAX_NUM_GPU];
|
|
static float *c_d[MAX_NUM_GPU];
|
|
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];
|
|
|
|
cudaStream_t streams[MAX_NUM_GPU];
|
|
|
|
#define BLOCK_SIZE 32
|
|
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
|
|
__global__ void matmul_kernel(float *A, float *B, float *C, int M, int N, int K) {
|
|
int j = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int i = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
int gj = blockIdx.x;
|
|
int gi = blockIdx.y;
|
|
|
|
if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check
|
|
|
|
int lj = threadIdx.x;
|
|
int li = threadIdx.y;
|
|
|
|
__shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];
|
|
__shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];
|
|
|
|
float c = 0.f;
|
|
|
|
int A_row_index = (gi * BLOCK_SIZE + li);
|
|
int B_col_index = (gj * BLOCK_SIZE + lj);
|
|
|
|
for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
|
|
int A_col_index = bk + lj;
|
|
Alocal[li][lj] = (A_row_index < M && A_col_index < K) ?
|
|
A[A_row_index * K + A_col_index] :
|
|
0.f;
|
|
|
|
int B_row_index = bk + li;
|
|
Blocal[li][lj] = (B_row_index < K && B_col_index < N) ?
|
|
B[B_row_index * N + B_col_index] :
|
|
0.f;
|
|
|
|
__syncthreads();
|
|
|
|
for (int lk = 0; lk < BLOCK_SIZE; ++lk) {
|
|
c += Alocal[li][lk] * Blocal[lk][lj];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
|
|
if (i < M && j < N)
|
|
C[i * N + j] = c;
|
|
}
|
|
|
|
void matmul(const float *A, const float *B, float *C, int M, int N, int K) {
|
|
|
|
MPI_Scatterv(A, Asendcounts, Adispls,
|
|
MPI_FLOAT, (void*)A, Asendcounts[mpi_rank], MPI_FLOAT,
|
|
0, MPI_COMM_WORLD);
|
|
MPI_Bcast((void*)B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
|
|
|
|
// Upload A and B matrix to every GPU
|
|
#pragma omp parallel for
|
|
for (int i = 0; i < num_devices; i++) {
|
|
CUDA_CALL(cudaSetDevice(i));
|
|
|
|
CUDA_CALL(cudaMemcpyAsync(
|
|
a_d[i], A + Mbegin[i] * K,
|
|
(Mend[i] - Mbegin[i]) * K * sizeof(float),
|
|
cudaMemcpyHostToDevice,
|
|
streams[i]));
|
|
CUDA_CALL(cudaMemcpyAsync(b_d[i], B, K * N * sizeof(float),
|
|
cudaMemcpyHostToDevice, streams[i]));
|
|
|
|
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
|
|
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (Mend[i] - Mbegin[i] + BLOCK_SIZE-1) / BLOCK_SIZE);
|
|
|
|
matmul_kernel<<<gridDim, blockDim, 0, streams[i]>>>(a_d[i], b_d[i], c_d[i], Mend[i] - Mbegin[i], N, K);
|
|
|
|
CUDA_CALL(cudaMemcpyAsync(C + Mbegin[i] * N, c_d[i],
|
|
(Mend[i] - Mbegin[i]) * N * sizeof(float),
|
|
cudaMemcpyDeviceToHost, streams[i]));
|
|
}
|
|
|
|
#pragma omp parallel for
|
|
for (int i = 0; i < num_devices; i++) {
|
|
CUDA_CALL(cudaSetDevice(i));
|
|
CUDA_CALL(cudaStreamSynchronize(streams[i]));
|
|
}
|
|
|
|
MPI_Gatherv(C, Crecvcounts[mpi_rank], MPI_FLOAT,
|
|
C, Crecvcounts, Cdispls, MPI_FLOAT,
|
|
0, MPI_COMM_WORLD);
|
|
}
|
|
|
|
void matmul_initialize(int M, int N, int K) {
|
|
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
|
|
MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
|
|
|
|
for (int i = 0; i < mpi_world_size; i++) {
|
|
Adispls[i] = ((M / mpi_world_size) * K) * i;
|
|
Asendcounts[i] = ((M / mpi_world_size) * K);
|
|
Cdispls[i] = ((M / mpi_world_size) * N) * i;
|
|
Crecvcounts[i] = ((M / mpi_world_size) * N);
|
|
}
|
|
Asendcounts[mpi_world_size - 1] = M*K - Adispls[mpi_world_size-1];
|
|
Crecvcounts[mpi_world_size - 1] = M*N - Cdispls[mpi_world_size-1];
|
|
|
|
// Only root process do something
|
|
CUDA_CALL(cudaGetDeviceCount(&num_devices));
|
|
|
|
int num_global_devices = 0;
|
|
MPI_Reduce(&num_devices, (void*)&num_global_devices, 1, MPI_INT,
|
|
MPI_SUM, 0, MPI_COMM_WORLD);
|
|
|
|
if (mpi_rank == 0) {
|
|
printf("Using %d devices\n", num_devices);
|
|
}
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
|
|
for (int j = 0; j < mpi_world_size; ++j) {
|
|
if (mpi_rank == j) {
|
|
for (int i = 0; i < num_devices; i++) {
|
|
cudaDeviceProp prop;
|
|
CUDA_CALL(cudaGetDeviceProperties(&prop, i));
|
|
|
|
// Try printing more detailed information here
|
|
printf("[rank %d] GPU %d: %s\n", mpi_rank, i, prop.name);
|
|
}
|
|
}
|
|
MPI_Barrier(MPI_COMM_WORLD);
|
|
}
|
|
|
|
if (num_devices <= 0) {
|
|
printf("[rank %d] No CUDA device found. Aborting\n", mpi_rank);
|
|
exit(1);
|
|
}
|
|
|
|
// Setup problem size for each GPU
|
|
for (int i = 0; i < num_devices; i++) {
|
|
Mbegin[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * i;
|
|
Mend[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * (i + 1);
|
|
}
|
|
Mend[num_devices - 1] = (Asendcounts[mpi_rank] / K);
|
|
|
|
// Allocate device memory for each GPU
|
|
for (int i = 0; i < num_devices; i++) {
|
|
CUDA_CALL(cudaSetDevice(i));
|
|
CUDA_CALL(cudaMalloc(&a_d[i], (Mend[i] - Mbegin[i]) * K * sizeof(float)));
|
|
CUDA_CALL(cudaMalloc(&b_d[i], K * N * sizeof(float)));
|
|
CUDA_CALL(cudaMalloc(&c_d[i], (Mend[i] - Mbegin[i]) * N * sizeof(float)));
|
|
|
|
CUDA_CALL(cudaStreamCreate(&streams[i]));
|
|
}
|
|
}
|
|
|
|
void matmul_finalize() {
|
|
for (int i = 0; i < num_devices; i++) {
|
|
CUDA_CALL(cudaFree(a_d[i]));
|
|
CUDA_CALL(cudaFree(b_d[i]));
|
|
CUDA_CALL(cudaFree(c_d[i]));
|
|
|
|
CUDA_CALL(cudaStreamDestroy(streams[i]));
|
|
}
|
|
}
|