#include "convolution.h"
|
|
#include <mpi.h>
|
|
#include <stdio.h>
|
|
#include <cuda_runtime.h>
|
|
|
|
#include "util.h"
|
|
|
|
// NVIDIA GeForce RTX 3090
// - Compute capability: 8.6
// - Maximum number of threads per block: 1024
// - Maximum amount of shared memory per SM: 100KB
// - Maximum amount of shared memory per thread block: 99KB
// - Maximum amount of local memory per thread: 512KB
// - Maximum x-dimension of a grid of thread blocks: 2^31-1
// - Maximum y- or z-dimension of a grid of thread blocks: 65535
// - Per SM:
//   - 128 FP32 cores
//   - 64 INT32 cores for integer math
//   - 4 warp schedulers

#define MAX_NUM_GPU 4
#define MAX_THREAD_PER_BLOCK 8 // block edge length: 8 x 8 = 64 threads per block

#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y)     ( CEIL_DIV((x),(y)) * (y) )
#define MIN(a,b)      ( ((a) < (b)) ? (a) : (b) )

// Compile-time log switches; remove the leading "//" before printf to enable tracing.
#define CUDA_LOG(fmt, ...)      //printf(fmt, ##__VA_ARGS__)
#define CUDA_KERN_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)

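// CUDA_CALL wraps a CUDA runtime call: on failure it prints the MPI rank, GPU
// index, source location, and error string, then aborts the process.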
#define CUDA_CALL(d, f) \
  { \
    cudaError_t err = (f); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n", \
              mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err)); \
      exit(1); \
    } \
  }

static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int num_devices;

// Array of device (GPU) pointers
static float *i_d[MAX_NUM_GPU];
static float *f_d[MAX_NUM_GPU];
static float *o_d[MAX_NUM_GPU];

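// Work decomposition: the batch dimension N is split across MPI ranks, and the
// output-channel dimension K is split across the GPUs of each node. Every GPU
// receives the node's full input slice plus its own slice of the filter, and
// each kernel thread computes one (OH, OW) output position, looping over the
// batch and output-channel indices assigned to its GPU.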
__global__ void conv_core(float *in, float *flt, float *out,
    int N, int K, int C, int H, int W, int R, int S, int OH, int OW,
    int stride, int pad, int dilation)
{
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int globalCol = blockDim.x * blockIdx.x + col;
  const int globalRow = blockDim.y * blockIdx.y + row;

  if (globalCol < OW && globalRow < OH)
  {
    for (int n = 0; n < N; ++n)
    {
      for (int k = 0; k < K; ++k)
      {
        float *outForK = &out[n * K * OH * OW + k * OH * OW];

        float o = 0.0f;
        for (int c = 0; c < C; ++c)
        {
          float *inForC = &in[n * C * H * W + c * H * W];
          float *fltForC = &flt[k * C * R * S + c * R * S];

          for (int r = 0; r < R; r++)
          {
            for (int s = 0; s < S; s++)
            {
              int h = globalRow * stride - pad + r * dilation;
              int w = globalCol * stride - pad + s * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W) continue;

              float i = inForC[h * W + w];
              float f = fltForC[r * S + s];

              o += i * f;
              // printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n",
              //        globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o);
            }
          }
        }

        outForK[globalRow * OW + globalCol] = o;

        CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, o:%p, o:%f\n",
            n, k, globalRow, globalCol,
            &outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]);
      }
    }
  }
}

#define MPI_ASYNC 1

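// convolution(): broadcast the filter and scatter the input batches over MPI
// ranks (non-blocking when MPI_ASYNC is 1), run the convolution kernel on each
// local GPU for its slice of output channels, then gather the outputs back to
// rank 0.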
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
#if MPI_ASYNC == 1
  MPI_Request reqs[3] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL };
#endif

  const int slicedN = _N / mpi_world_size;

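  // Batch split: rank 0 keeps the first N batches (N includes the remainder of
  // _N / mpi_world_size, see convolution_init) and sends slicedN batches to
  // each of the other ranks.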
  ////////////////////////////////////////////////////////////////////////////////
  // scatter
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;
  }

  if (slicedN > 0)
  {
#if MPI_ASYNC == 1
    MPI_Ibcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]);
#else
    MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif

    if (mpi_rank == 0)
    {
      const int sizeOfN = slicedN * C * H * W;
      float *inputForNodes = input + (N * C * H * W);

      for (int i = 1; i < mpi_world_size; i++)
      {
#if MPI_ASYNC == 1
        // Complete the previous send before reusing the request slot.
        MPI_Wait(&reqs[1], MPI_STATUS_IGNORE);
        MPI_Isend(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &reqs[1]);
#else
        MPI_Send(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD);
#endif
      }
    }
    else
    {
#if MPI_ASYNC == 1
      MPI_Irecv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &reqs[1]);
#else
      MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif

      // zero_tensor(output, N, K, OH, OW);
    }

#if MPI_ASYNC == 1
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
#endif
  }

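  // Output-channel split: each local GPU computes sizeK[d] of the K output
  // channels; the last GPU also takes the remainder when K does not divide
  // evenly by num_devices.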
  ////////////////////////////////////////////////////////////////////////////////
  // calc
  int slicedK = K / num_devices;
  int startK[MAX_NUM_GPU];
  int sizeK[MAX_NUM_GPU];
  for (int d = 0; d < num_devices; d++)
  {
    startK[d] = slicedK * d;
    sizeK[d] = slicedK;
  }
  sizeK[num_devices - 1] = K - startK[num_devices - 1];

  for (int d = 0; d < num_devices; d++)
  {
    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }

    CUDA_CALL( d, cudaSetDevice(d) );

    // Upload input data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
        mpi_rank, d, i_d[d], input, N * C * H * W * sizeof(float), N * C * H * W * sizeof(float));

    CUDA_CALL( d, cudaMemcpyAsync(i_d[d], input,
        N * C * H * W * sizeof(float),
        cudaMemcpyHostToDevice) );

    // Upload this GPU's slice of the filter
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
        mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S], sizeK[d] * C * R * S * sizeof(float), sizeK[d] * C * R * S * sizeof(float));

    CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
        sizeK[d] * C * R * S * sizeof(float),
        cudaMemcpyHostToDevice) );

    // Launch the kernel on this GPU
    dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
    dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK), CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);

    CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
        ", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
        ", stride=%d, pad=%d, dilation=%d\n",
        mpi_rank, d, i_d[d], f_d[d], o_d[d],
        N, sizeK[d], C, H, W, R, S, OH, OW,
        stride, pad, dilation);

    conv_core<<<gridDim, blockDim>>>(i_d[d], f_d[d], o_d[d],
        N, sizeK[d], C, H, W, R, S, OH, OW,
        stride, pad, dilation);
  }

  // Download output data from GPUs
  for (int d = 0; d < num_devices; d++)
  {
    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }

    CUDA_CALL( d, cudaSetDevice(d) );

    for (int n = 0; n < N; n++)
    {
      CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
          mpi_rank, d, &output[(n * K * OH * OW) + (startK[d] * OH * OW)], o_d[d] + n * sizeK[d] * OH * OW,
          sizeK[d] * OH * OW * sizeof(float), sizeK[d] * OH * OW * sizeof(float));

      CUDA_CALL( d, cudaMemcpyAsync(&output[(n * K * OH * OW) + (startK[d] * OH * OW)],
          o_d[d] + n * sizeK[d] * OH * OW,
          sizeK[d] * OH * OW * sizeof(float),
          cudaMemcpyDeviceToHost) );
    }
  }

  // Wait for every GPU's kernel and copies to finish
  for (int d = 0; d < num_devices; d++)
  {
    CUDA_CALL( d, cudaSetDevice(d) );
    CUDA_CALL( d, cudaDeviceSynchronize() );
  }

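  // Gather: non-zero ranks send their output slice back to rank 0, which places
  // each slice after its own N batches in the full output tensor.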
  ////////////////////////////////////////////////////////////////////////////////
  // gather
  if (slicedN > 0)
  {
    if (mpi_rank == 0)
    {
      const int sizeOfN = slicedN * K * OH * OW;
      float *outputForNodes = output + (N * K * OH * OW);

      for (int i = 1; i < mpi_world_size; i++)
      {
#if MPI_ASYNC == 1
        // Complete the previous receive before reusing the request slot.
        MPI_Wait(&reqs[2], MPI_STATUS_IGNORE);
        MPI_Irecv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, &reqs[2]);
#else
        MPI_Recv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif
      }
    }
    else
    {
#if MPI_ASYNC == 1
      MPI_Isend(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &reqs[2]);
#else
      MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
#endif
    }

#if MPI_ASYNC == 1
    MPI_Wait(&reqs[2], MPI_STATUS_IGNORE);
#endif
  }
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

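  // Standard convolution output size:
  //   OH = floor((H + 2*pad - dilation*(R-1) - 1) / stride) + 1, likewise for OW.
  // For example (hypothetical values): H=256, pad=1, R=3, dilation=1, stride=1
  // gives OH = (256 + 2 - 2 - 1) / 1 + 1 = 256.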
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  N = _N / mpi_world_size;

  if (mpi_rank == 0)
  {
    // Rank 0 keeps the remainder of the batch split
    N = _N - N * (mpi_world_size - 1);
  }
  else
  {
    if (N > 0)
    {
      alloc_tensor(&input, N, C, H, W);
      alloc_tensor(&filter, K, C, R, S);
      alloc_tensor(&output, N, K, OH, OW);
    }
  }

  CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU)
  {
    num_devices = MAX_NUM_GPU;
  }

  if (num_devices <= 0)
  {
    printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  if (N > 0)
  {
    // Allocate device memory for each GPU
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) );   // 32 64 256 256
      CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) );   // 64 64 16 16
      CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // CUDA
  if (N > 0)
  {
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaFree(i_d[d]) );
      CUDA_CALL( d, cudaFree(f_d[d]) );
      CUDA_CALL( d, cudaFree(o_d[d]) );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }

  // MPI
  MPI_Barrier(MPI_COMM_WORLD);

  if (N > 0)
  {
    if (mpi_rank != 0)
    {
      // Free the host buffers allocated in convolution_init on worker ranks
      free(input);
      free(filter);
      free(output);
    }
  }
}
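
// ----------------------------------------------------------------------------
// Usage sketch (illustrative only): a minimal, hypothetical driver showing the
// assumed call order for the three functions above. The surrounding harness
// (MPI setup, host buffer allocation, tensor initialization) is an assumption
// here and is not defined in this file.
//
//   MPI_Init(&argc, &argv);
//   convolution_init(N, C, H, W, K, R, S, pad, dilation, stride);
//   // On rank 0, input/output/filter are the full host tensors; the arguments
//   // of other ranks are unused (they use buffers from convolution_init).
//   convolution(input, output, filter, N, C, H, W, K, R, S, pad, dilation, stride);
//   convolution_final(N, C, H, W, K, R, S, pad, dilation, stride);
//   MPI_Finalize();
// ----------------------------------------------------------------------------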