#include "convolution.h"

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#include "util.h"

// NVIDIA GeForce RTX 3090
// - Compute capability: 8.6
// - Maximum number of threads per block: 1024
// - Maximum amount of shared memory per SM: 100 KB
// - Maximum amount of shared memory per thread block: 99 KB
// - Maximum amount of local memory per thread: 512 KB
// - Maximum x-dimension of a grid of thread blocks: 2^31 - 1
// - Maximum y- or z-dimension of a grid of thread blocks: 65535
// - In an SM:
//   - 128 FP32 cores
//   - 64 INT32 cores for integer math
//   - 4 warp schedulers

#define MAX_NUM_GPU 4
#define MAX_THREAD_PER_BLOCK 8  // block edge length; each block is 8 x 8 = 64 threads
#define MAX_NUM_STREAM 4

#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y) ( CEIL_DIV((x),(y)) * (y) )
#define MIN(a,b) ( ((a) < (b)) ? (a) : (b) )

#define CUDA_LOG(fmt, ...)      //printf(fmt, ##__VA_ARGS__)
#define CUDA_KERN_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)

#define CUDA_CALL(d, f) \
  { \
    cudaError_t err = (f); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n", \
              mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err)); \
      exit(1); \
    } \
  }

static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int num_devices;

static int *startN;
static int *sizeN;

// Array of device (GPU) pointers
static float *i_d[MAX_NUM_GPU];
static float *f_d[MAX_NUM_GPU];
static float *o_d[MAX_NUM_GPU];

cudaStream_t streams[MAX_NUM_GPU][MAX_NUM_STREAM];
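
// conv_core: each thread computes one output position (globalRow, globalCol)
// and loops over the batch (N) and output channels (K) it was given,
// accumulating the C x R x S dot product; taps that fall outside the padded
// input are skipped. Threads outside OH x OW (from grid rounding) do no work.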
__global__ void conv_core(float *in, float *flt, float *out,
    int N, int K, int C, int H, int W, int R, int S, int OH, int OW,
    int stride, int pad, int dilation)
{
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int globalCol = blockDim.x * blockIdx.x + col;
  const int globalRow = blockDim.y * blockIdx.y + row;

  if (globalCol < OW && globalRow < OH)
  {
    for (int n = 0; n < N; ++n)
    {
      for (int k = 0; k < K; ++k)
      {
        float *outForK = &out[n * K * OH * OW + k * OH * OW];

        float o = 0.0f;
        for (int c = 0; c < C; ++c)
        {
          float *inForC = &in[n * C * H * W + c * H * W];
          float *fltForC = &flt[k * C * R * S + c * R * S];

          for (int r = 0; r < R; r++)
          {
            for (int s = 0; s < S; s++)
            {
              int h = globalRow * stride - pad + r * dilation;
              int w = globalCol * stride - pad + s * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W) continue;

              float i = inForC[h * W + w];
              float f = fltForC[r * S + s];

              o += i * f;
              // printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n",
              //        globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o);
            }
          }
        }

        outForK[globalRow * OW + globalCol] = o;

        CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, o:%p, o:%f\n",
                      n, k, globalRow, globalCol,
                      &outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]);
      }
    }
  }
}
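
// cuda_alloc / cuda_free: page-locked (pinned) host allocations, so the
// cudaMemcpyAsync calls below can overlap with kernel execution on the streams.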
void cuda_alloc(float **t, int D0, int D1, int D2, int D3)
{
  CUDA_CALL( 0, cudaHostAlloc((void **) t, sizeof(float) * D0 * D1 * D2 * D3, cudaHostAllocDefault) );

  if (*t == NULL)
  {
    printf("Failed to allocate memory for matrix.\n");
    exit(1);
  }
}

void cuda_free(float *t)
{
  CUDA_CALL( 0, cudaFreeHost(t) );
}
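
// convolution: distributed convolution driver.
//   1. Scatter: rank 0 broadcasts the filter and sends each rank its slice of
//      the input batch (rank 0 keeps the caller-provided buffers).
//   2. Compute: the K output channels are split across the local GPUs; each
//      sample is copied to the GPUs, convolved, and copied back on a
//      round-robin stream so transfers and kernels can overlap.
//   3. Gather: rank 0 receives every other rank's output slice.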
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // scatter
  MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);

  if (mpi_rank == 0)
  {
    for (int i = 1; i < mpi_world_size; i++)
    {
      const int sizeOfN = sizeN[i] * C * H * W;
      float *inputForNodes = input + (startN[i] * C * H * W);

      if (sizeOfN == 0)
      {
        continue;
      }

      MPI_Send(inputForNodes, sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD);
    }
  }
  else
  {
    if (N > 0)
    {
      MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // calc
  int slicedK = K / num_devices;
  int startK[MAX_NUM_GPU];
  int sizeK[MAX_NUM_GPU];
  for (int d = 0; d < num_devices; d++)
  {
    startK[d] = slicedK * d;
    sizeK[d] = slicedK;
  }
  sizeK[num_devices - 1] = K - startK[num_devices - 1];
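  // e.g., with K = 64 and 4 GPUs, slicedK = 16 and GPU d covers output
  // channels [16*d, 16*d + 16); any remainder of K / num_devices goes to the
  // last GPU.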

  for (int d = 0; d < num_devices; d++)
  {
    int streamID = 0;

    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }

    CUDA_CALL( d, cudaSetDevice(d) );

    // Upload filter data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
             mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S],
             sizeK[d] * C * R * S * sizeof(float), sizeK[d] * C * R * S * sizeof(float));

    CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
                                  sizeK[d] * C * R * S * sizeof(float),
                                  cudaMemcpyHostToDevice, streams[d][streamID]) );
  }
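
  // Stream the batch one sample at a time: sample streamN uses stream
  // streamN % MAX_NUM_STREAM on each GPU, so host-to-device copies, kernel
  // launches, and device-to-host copies of different samples can overlap.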
  for (int streamN = 0; streamN < N; streamN++)
  {
    int streamID = streamN % MAX_NUM_STREAM;

    for (int d = 0; d < num_devices; d++)
    {
      if (N <= 0 || sizeK[d] <= 0)
      {
        continue;
      }

      CUDA_CALL( d, cudaSetDevice(d) );

      // Upload input data to every GPU
      CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
               mpi_rank, d,
               i_d[d] + streamN * C * H * W, input + streamN * C * H * W,
               C * H * W * sizeof(float), C * H * W * sizeof(float));

      CUDA_CALL( d, cudaMemcpyAsync(i_d[d] + streamN * C * H * W, input + streamN * C * H * W,
                                    C * H * W * sizeof(float),
                                    cudaMemcpyHostToDevice, streams[d][streamID]) );
    }

    for (int d = 0; d < num_devices; d++)
    {
      if (N <= 0 || sizeK[d] <= 0)
      {
        continue;
      }

      CUDA_CALL( d, cudaSetDevice(d) );

      // Launch kernel on every GPU
      dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
      dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK), CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);

      CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
               ", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
               ", stride=%d, pad=%d, dilation=%d\n",
               mpi_rank, d,
               i_d[d] + streamN * C * H * W, f_d[d], o_d[d] + streamN * sizeK[d] * OH * OW,
               1, sizeK[d], C, H, W, R, S, OH, OW,
               stride, pad, dilation);

      conv_core<<<gridDim, blockDim, 0, streams[d][streamID]>>>(
          i_d[d] + streamN * C * H * W, f_d[d], o_d[d] + streamN * sizeK[d] * OH * OW,
          1, sizeK[d], C, H, W, R, S, OH, OW,
          stride, pad, dilation);
    }

    // Download output data from GPUs
    for (int d = 0; d < num_devices; d++)
    {
      if (N <= 0 || sizeK[d] <= 0)
      {
        continue;
      }

      CUDA_CALL( d, cudaSetDevice(d) );

      // for (int n = 0; n < N; n++)
      {
        CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
                 mpi_rank, d,
                 output + (streamN * K * OH * OW) + (startK[d] * OH * OW), o_d[d] + streamN * sizeK[d] * OH * OW,
                 sizeK[d] * OH * OW * sizeof(float), sizeK[d] * OH * OW * sizeof(float));

        CUDA_CALL( d, cudaMemcpyAsync(output + (streamN * K * OH * OW) + (startK[d] * OH * OW),
                                      o_d[d] + streamN * sizeK[d] * OH * OW,
                                      sizeK[d] * OH * OW * sizeof(float),
                                      cudaMemcpyDeviceToHost, streams[d][streamID]) );
      }
    }
  }

  for (int d = 0; d < num_devices; d++)
  {
    CUDA_CALL( d, cudaSetDevice(d) );

    CUDA_CALL( d, cudaDeviceSynchronize() );
  }

  ////////////////////////////////////////////////////////////////////////////////
  // gather

  if (mpi_rank == 0)
  {
    for (int i = 1; i < mpi_world_size; i++)
    {
      const int sizeOfN = sizeN[i] * K * OH * OW;
      float *outputForNodes = output + (startN[i] * K * OH * OW);

      if (sizeOfN == 0)
      {
        continue;
      }

      MPI_Recv(outputForNodes, sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }
  else
  {
    if (N > 0)
    {
      MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
    }
  }
}
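
// convolution_init: computes the output shape, partitions the batch across
// MPI ranks, allocates pinned host buffers on non-root ranks and device
// buffers on every visible GPU, and creates the per-GPU streams.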
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // startN/sizeN are int arrays; the float tensor allocator is reused here
  // on the assumption that sizeof(int) == sizeof(float).
  alloc_tensor((float**)&startN, mpi_world_size, 1, 1, 1);
  alloc_tensor((float**)&sizeN, mpi_world_size, 1, 1, 1);

  int slicedN = (_N <= mpi_world_size) ? 1 : ((_N * 3) / (mpi_world_size * 4));
  int offsetN = 0;
  for (int i = 0; i < mpi_world_size; i++)
  {
    startN[i] = offsetN;
    sizeN[i] = MIN(slicedN, _N - startN[i]);

    offsetN += sizeN[i];
  }
  sizeN[mpi_world_size - 1] = _N - startN[mpi_world_size - 1];
  N = sizeN[mpi_rank];
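
  // Illustrative numbers: with _N = 32 and mpi_world_size = 2, slicedN is
  // (32 * 3) / (2 * 4) = 12, so rank 0 gets 12 samples while the last rank
  // takes the remaining 20; the 3/4 factor under-assigns the earlier ranks
  // and leaves the tail of the batch to the last rank.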

  // Rank 0 keeps using the caller-provided host buffers; the other ranks
  // allocate their own pinned buffers to receive their slice of the work.
  if (mpi_rank != 0)
  {
    if (N > 0)
    {
      cuda_alloc(&input, N, C, H, W);
      cuda_alloc(&output, N, K, OH, OW);
    }

    cuda_alloc(&filter, K, C, R, S);
  }

  CUDA_LOG("[Node %d] N = %d, _N = %d, slicedN = %d, startN=%d, sizeN=%d\n",
           mpi_rank, N, _N, slicedN, startN[mpi_rank], sizeN[mpi_rank]);

  CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU)
  {
    num_devices = MAX_NUM_GPU;
  }

  if (num_devices <= 0)
  {
    printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  if (N > 0)
  {
    // Allocate device memory for each GPU
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) );   // 32 64 256 256
      CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) );   // 64 64 16 16
      CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      for (int s = 0; s < MAX_NUM_STREAM; s++)
      {
        CUDA_CALL( d, cudaStreamCreate(&streams[d][s]) );
      }

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
}
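
// convolution_final: releases the per-GPU device buffers and streams, then
// frees the pinned host buffers owned by non-root ranks and the partition
// arrays.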
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // CUDA
  if (N > 0)
  {
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaFree(i_d[d]) );
      CUDA_CALL( d, cudaFree(f_d[d]) );
      CUDA_CALL( d, cudaFree(o_d[d]) );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      for (int s = 0; s < MAX_NUM_STREAM; s++)
      {
        CUDA_CALL( d, cudaStreamDestroy(streams[d][s]) );
      }

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }

  // MPI
  MPI_Barrier(MPI_COMM_WORLD);

  // Rank 0 never allocated its own host buffers, so only the other ranks
  // free them here.
  if (mpi_rank != 0)
  {
    if (N > 0)
    {
      cuda_free(input);
      cuda_free(output);
    }

    cuda_free(filter);
  }

  free(startN);
  free(sizeN);
}