// chundoong-lab-ta/SamsungDS22/submissions/final/jb114.seo/B/convolution_5580.cu
#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "util.h"
// NVIDIA GeForce RTX 3090
// - Computing capability: 8.6
// - Maximum number of threads per block: 1024
// - Maximum amount of shared memory per SM: 100KB
// - Maximum amount of shared memory per thread block : 99KB
// - Maximum amount of local memory per thread: 512KB
// - Maximum x-dimension of a grid of thread blocks: 2^31 - 1
// - Maximum y- or z-dimension of a grid of thread blocks: 65535
// - In an SM:
//   - 128 FP32 cores
//   - 64 INT32 cores for integer math
//   - 4 warp schedulers
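// As a sanity check, the limits listed above can be queried at runtime with
// cudaGetDeviceProperties(); the snippet below is an illustrative sketch only
// and is not called anywhere in this file:
//   cudaDeviceProp prop;
//   cudaGetDeviceProperties(&prop, /*device=*/0);
//   printf("CC %d.%d, %d threads/block max, %zu B smem/SM, %zu B smem/block (opt-in)\n",
//          prop.major, prop.minor, prop.maxThreadsPerBlock,
//          prop.sharedMemPerMultiprocessor, prop.sharedMemPerBlockOptin);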
#define MAX_NUM_GPU 4
#define MAX_THREAD_PER_BLOCK 8 // block edge length: 8 x 8 = 64 threads per block
#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y) ( CEIL_DIV((x),(y)) * (y) )
#define MIN(a,b) ( ((a) < (b)) ? (a) : (b) )
#define CUDA_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)
#define CUDA_KERN_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)
#define CUDA_CALL(d, f)                                                          \
  do {                                                                           \
    cudaError_t err = (f);                                                        \
    if (err != cudaSuccess) {                                                     \
      fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n",           \
              mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err));    \
      exit(1);                                                                     \
    }                                                                              \
  } while (0)
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int num_devices;
// Array of device (GPU) pointers
static float *i_d[MAX_NUM_GPU];
static float *f_d[MAX_NUM_GPU];
static float *o_d[MAX_NUM_GPU];
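// conv_core: each thread computes one (oh, ow) output position and loops over
// every batch sample n and every output channel k it is given. K here is the
// per-GPU slice of output channels, so `flt` and `out` are indexed relative to
// that slice, while `in` always holds the full input tensor.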
__global__ void conv_core(float *in, float *flt, float *out,
                          int N, int K, int C, int H, int W, int R, int S, int OH, int OW,
                          int stride, int pad, int dilation)
{
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int globalCol = blockDim.x * blockIdx.x + col;
  const int globalRow = blockDim.y * blockIdx.y + row;
  if (globalCol < OW && globalRow < OH)
  {
    for (int n = 0; n < N; ++n)
    {
      for (int k = 0; k < K; ++k)
      {
        float *outForK = &out[n * K * OH * OW + k * OH * OW];
        float o = 0.0f;
        for (int c = 0; c < C; ++c)
        {
          float *inForC = &in[n * C * H * W + c * H * W];
          float *fltForC = &flt[k * C * R * S + c * R * S];
          for (int r = 0; r < R; r++)
          {
            for (int s = 0; s < S; s++)
            {
              int h = globalRow * stride - pad + r * dilation;
              int w = globalCol * stride - pad + s * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W) continue;
              float i = inForC[h * W + w];
              float f = fltForC[r * S + s];
              o += i * f;
              // printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n",
              //        globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o);
            }
          }
        }
        outForK[globalRow * OW + globalCol] = o;
        CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, o:%p, o:%f\n",
                      n, k, globalRow, globalCol,
                      &outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]);
      }
    }
  }
}
#define MPI_ASYNC 1
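// convolution: distributes work in two stages.
//   1. MPI: the batch dimension N is split across ranks; rank 0 scatters the
//      input slices and broadcasts the filter, then gathers the output slices.
//   2. CUDA: within each rank the output channels K are split across the
//      available GPUs; every GPU receives the full input plus its filter slice.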
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
#if MPI_ASYNC == 1
  MPI_Request reqs[3] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL };
#endif
  const int slicedN = _N / mpi_world_size;
  //////////////////////////////////////////////////////////////////////////////
  // scatter
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;
  }
  if (slicedN > 0)
  {
#if MPI_ASYNC == 1
    MPI_Ibcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]);
#else
    MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif
    if (mpi_rank == 0)
    {
      const int sizeOfN = slicedN * C * H * W;
      float *inputForNodes = input + (N * C * H * W);
      for (int i = 1; i < mpi_world_size; i++)
      {
#if MPI_ASYNC == 1
        // reqs[1] holds only one outstanding send; complete the previous one
        // before reusing the slot (no-op on the first iteration).
        MPI_Wait(&reqs[1], MPI_STATUS_IGNORE);
        MPI_Isend(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &reqs[1]);
#else
        MPI_Send(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD);
#endif
      }
    }
    else
    {
#if MPI_ASYNC == 1
      MPI_Irecv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &reqs[1]);
#else
      MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif
      // zero_tensor(output, N, K, OH, OW);
    }
#if MPI_ASYNC == 1
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
#endif
  }
  //////////////////////////////////////////////////////////////////////////////
  // calc
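  // Each GPU gets a contiguous block of slicedK output channels (the last GPU
  // also takes the remainder), the full input, and its slice of the filter.
  // Kernels are launched with one thread per (oh, ow) output position.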
  int slicedK = K / num_devices;
  int startK[MAX_NUM_GPU];
  int sizeK[MAX_NUM_GPU];
  for (int d = 0; d < num_devices; d++)
  {
    startK[d] = slicedK * d;
    sizeK[d] = slicedK;
  }
  sizeK[num_devices - 1] = K - startK[num_devices - 1];
  for (int d = 0; d < num_devices; d++)
  {
    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }
    CUDA_CALL( d, cudaSetDevice(d) );
    // Upload input data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
             mpi_rank, d, i_d[d], input, N * C * H * W * sizeof(float), N * C * H * W * sizeof(float));
    CUDA_CALL( d, cudaMemcpyAsync(i_d[d], input,
                                  N * C * H * W * sizeof(float),
                                  cudaMemcpyHostToDevice) );
    // Upload filter data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
             mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S], sizeK[d] * C * R * S * sizeof(float), sizeK[d] * C * R * S * sizeof(float));
    CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
                                  sizeK[d] * C * R * S * sizeof(float),
                                  cudaMemcpyHostToDevice) );
    // Launch kernel on every GPU
    dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
    dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK), CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);
    CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
             ", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
             ", stride=%d, pad=%d, dilation=%d\n",
             mpi_rank, d, i_d[d], f_d[d], o_d[d],
             N, sizeK[d], C, H, W, R, S, OH, OW,
             stride, pad, dilation);
    conv_core<<<gridDim, blockDim>>>(i_d[d], f_d[d], o_d[d],
                                     N, sizeK[d], C, H, W, R, S, OH, OW,
                                     stride, pad, dilation);
  }
  // Download output data from GPUs
  for (int d = 0; d < num_devices; d++)
  {
    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }
    CUDA_CALL( d, cudaSetDevice(d) );
    for (int n = 0; n < N; n++)
    {
      CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
               mpi_rank, d, &output[(n * K * OH * OW) + (startK[d] * OH * OW)], o_d[d] + n * sizeK[d] * OH * OW,
               sizeK[d] * OH * OW * sizeof(float), sizeK[d] * OH * OW * sizeof(float));
      CUDA_CALL( d, cudaMemcpyAsync(&output[(n * K * OH * OW) + (startK[d] * OH * OW)],
                                    o_d[d] + n * sizeK[d] * OH * OW,
                                    sizeK[d] * OH * OW * sizeof(float),
                                    cudaMemcpyDeviceToHost) );
    }
  }
  for (int d = 0; d < num_devices; d++)
  {
    // Select each device before synchronizing it; otherwise only the most
    // recently selected device is synchronized.
    CUDA_CALL( d, cudaSetDevice(d) );
    CUDA_CALL( d, cudaDeviceSynchronize() );
  }
  //////////////////////////////////////////////////////////////////////////////
  // gather
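  // Non-root ranks send their full N x K x OH x OW output back to rank 0,
  // which places each slice after its own portion of the output tensor.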
  if (slicedN > 0)
  {
    if (mpi_rank == 0)
    {
      const int sizeOfN = slicedN * K * OH * OW;
      float *outputForNodes = output + (N * K * OH * OW);
      for (int i = 1; i < mpi_world_size; i++)
      {
#if MPI_ASYNC == 1
        // reqs[2] holds only one outstanding receive; complete the previous
        // one before reusing the slot (no-op on the first iteration).
        MPI_Wait(&reqs[2], MPI_STATUS_IGNORE);
        MPI_Irecv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, &reqs[2]);
#else
        MPI_Recv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif
      }
    }
    else
    {
#if MPI_ASYNC == 1
      MPI_Isend(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &reqs[2]);
#else
      MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
#endif
    }
#if MPI_ASYNC == 1
    MPI_Wait(&reqs[2], MPI_STATUS_IGNORE);
#endif
  }
}
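// convolution_init: computes the output size
//   OH = (H + 2*pad - dilation*(R-1) - 1) / stride + 1   (and likewise OW),
// splits the batch N across MPI ranks (rank 0 keeps the remainder), allocates
// host buffers on non-root ranks, and allocates per-GPU device buffers.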
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
  N = _N / mpi_world_size;
  if (mpi_rank == 0)
  {
    N = _N - N * (mpi_world_size - 1);
  }
  else
  {
    if (N > 0)
    {
      alloc_tensor(&input, N, C, H, W);
      alloc_tensor(&filter, K, C, R, S);
      alloc_tensor(&output, N, K, OH, OW);
    }
  }
  CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU)
  {
    num_devices = MAX_NUM_GPU;
  }
  if (num_devices <= 0)
  {
    printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }
  if (N > 0)
  {
    // Allocate device memory for each GPU
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) );   // 32 64 256 256
      CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) );   // 64 64 16 16
      CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
    }
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
}
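// convolution_final: releases the per-GPU device buffers and the host buffers
// that were allocated on non-root ranks in convolution_init.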
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // CUDA
  if (N > 0)
  {
    for (int d = 0; d < num_devices; d++)
    {
      // Select each device before synchronizing it.
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaFree(i_d[d]) );
      CUDA_CALL( d, cudaFree(f_d[d]) );
      CUDA_CALL( d, cudaFree(o_d[d]) );
    }
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
  // MPI
  MPI_Barrier(MPI_COMM_WORLD);
  if (N > 0)
  {
    if (mpi_rank != 0)
    {
      free(input);
      free(filter);
      free(output);
    }
  }
}