chundoong-lab-ta/SamsungDS22/submissions/final/jb114.seo/tmp-B/convolution_6252.cu

#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "util.h"
// NVIDIA GeForce RTX 3090
// - Compute capability: 8.6
// - Maximum number of threads per block: 1024
// - Maximum amount of shared memory per SM: 100 KB
// - Maximum amount of shared memory per thread block: 99 KB
// - Maximum amount of local memory per thread: 512 KB
// - Maximum x-dimension of a grid of thread blocks: 2^31 - 1
// - Maximum y- or z-dimension of a grid of thread blocks: 65535
// - In an SM:
//   128 FP32 cores
//   64 INT32 cores for integer math
//   4 warp schedulers
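// A minimal sketch, kept as a comment so it is not compiled, of how these
// limits could be queried at runtime through the CUDA runtime API:
//   cudaDeviceProp prop;
//   cudaGetDeviceProperties(&prop, 0);
//   printf("maxThreadsPerBlock=%d, sharedMemPerSM=%zu\n",
//          prop.maxThreadsPerBlock, prop.sharedMemPerMultiprocessor);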
#define MAX_NUM_GPU 4
#define MAX_THREAD_PER_BLOCK 8 // 8 x 8 = 64 threads per block
#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y) ( CEIL_DIV((x),(y)) * (y) )
#define MIN(a,b) ( ((a) < (b)) ? (a) : (b) )
// Debug logging macros; disabled by default (uncomment the printf calls to enable)
#define CUDA_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)
#define CUDA_KERN_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)
#define CUDA_CALL(d, f) \
do { \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n", \
mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err)); \
exit(1); \
} \
} while (0)
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int num_devices;
static int *startN;
static int *sizeN;
// Array of device (GPU) pointers
static float *i_d[MAX_NUM_GPU];
static float *f_d[MAX_NUM_GPU];
static float *o_d[MAX_NUM_GPU];
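// conv_core: one thread per output position (globalRow, globalCol) = (oh, ow).
// Each thread loops over every sample n and every output channel k owned by
// this GPU (the K argument is the per-GPU slice of output channels) and
// accumulates the convolution sum over C, R and S; input positions that fall
// into the padding region are skipped.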
__global__ void conv_core(float *in, float *flt, float *out,
int N, int K, int C, int H, int W, int R, int S, int OH, int OW,
int stride, int pad, int dilation)
{
const int col = threadIdx.x;
const int row = threadIdx.y;
const int globalCol = blockDim.x * blockIdx.x + col;
const int globalRow = blockDim.y * blockIdx.y + row;
if (globalCol < OW && globalRow < OH)
{
for (int n = 0; n < N; ++n)
{
for (int k = 0; k < K; ++k)
{
float *outForK = &out[n * K * OH * OW + k * OH * OW];
float o = 0.0f;
for (int c = 0; c < C; ++c)
{
float *inForC = &in[n * C * H * W + c * H * W];
float *fltForC = &flt[k * C * R * S + c * R * S];
for (int r = 0; r < R; r++)
{
for (int s = 0; s < S; s++)
{
int h = globalRow * stride - pad + r * dilation;
int w = globalCol * stride - pad + s * dilation;
if (h < 0 || h >= H || w < 0 || w >= W) continue;
float i = inForC[h * W + w];
float f = fltForC[r * S + s];
o += i * f;
// printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n",
// globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o);
}
}
}
outForK[globalRow * OW + globalCol] = o;
CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, addr:%p, val:%f\n",
n, k, globalRow, globalCol,
&outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]);
}
}
}
}
void cuda_alloc(float **t, int D0, int D1, int D2, int D3)
{
CUDA_CALL( 0, cudaHostAlloc((void **) t, sizeof(float) * D0 * D1 * D2 * D3, cudaHostAllocDefault) );
if (*t == NULL)
{
printf("Failed to allocate memory for matrix.\n");
exit(0);
}
}
void cuda_free(float *t)
{
CUDA_CALL( 0, cudaFreeHost(t) );
}
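// convolution: rank 0 broadcasts the filter, scatters contiguous slabs of the
// batch dimension N to the other ranks, and every rank then splits the output
// channels K across its local GPUs, runs conv_core on each, and copies the
// partial outputs back; non-root ranks finally send their results to rank 0.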
void convolution(
float *_input, float *_output, float *_filter,
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride)
{
if (mpi_rank == 0)
{
input = _input;
output = _output;
filter = _filter;
// CUDA_CALL( 0, cudaHostRegister(input, sizeof(float) * _N * C * H * W, cudaHostRegisterDefault) );
// CUDA_CALL( 0, cudaHostRegister(output, sizeof(float) * _N * K * OH * OW, cudaHostRegisterDefault) );
// CUDA_CALL( 0, cudaHostRegister(filter, sizeof(float) * K * C * R * S, cudaHostRegisterDefault) );
}
////////////////////////////////////////////////////////////////////////////////
// scatter
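// Ranks whose slab is empty (sizeN[i] == 0) are skipped entirely; non-root
// ranks receive their input slab into the page-locked buffer allocated in
// convolution_init().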
MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
if (mpi_rank == 0)
{
for (int i = 1; i < mpi_world_size; i++)
{
const int sizeOfN = sizeN[i] * C * H * W;
float *inputForNodes = input + (startN[i] * C * H * W);
if (sizeOfN == 0)
{
continue;
}
MPI_Send(inputForNodes, sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD);
}
}
else
{
if (N > 0)
{
MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
}
////////////////////////////////////////////////////////////////////////////////
// calc
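// Split the output channels evenly across the local GPUs; the last GPU also
// picks up whatever remainder the even split leaves.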
int slicedK = K / num_devices;
int startK[MAX_NUM_GPU];
int sizeK[MAX_NUM_GPU];
for (int d = 0; d < num_devices; d++)
{
startK[d] = slicedK * d;
sizeK[d] = slicedK;
}
sizeK[num_devices - 1] = K - startK[num_devices - 1];
for (int d = 0; d < num_devices; d++)
{
if (N <= 0 || sizeK[d] <= 0)
{
continue;
}
CUDA_CALL( d, cudaSetDevice(d) );
// Upload input data to every GPU
CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
mpi_rank, d, i_d[d], input, N * C * H * W * sizeof(float), N * C * H * W * sizeof(float));
CUDA_CALL( d, cudaMemcpyAsync(i_d[d], input,
N * C * H * W * sizeof(float),
cudaMemcpyHostToDevice) );
// Upload filter data to every GPU
CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S], sizeK[d] * C * R * S * sizeof(float), sizeK[d] * C * R * S * sizeof(float));
CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
sizeK[d] * C * R * S * sizeof(float),
cudaMemcpyHostToDevice) );
// Launch kernel on every GPU
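// 8x8 threads per block; the grid tiles the OW x OH output plane, and each
// thread iterates over all samples and this GPU's K slice inside the kernel.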
dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK), CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);
CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
", stride=%d, pad=%d, dilation=%d\n",
mpi_rank, d, i_d[d], f_d[d], o_d[d],
N, sizeK[d], C, H, W, R, S, OH, OW,
stride, pad, dilation);
conv_core<<<gridDim, blockDim>>>(i_d[d], f_d[d], o_d[d],
N, sizeK[d], C, H, W, R, S, OH, OW,
stride, pad, dilation);
}
// Download output data from GPUs
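// o_d[d] is packed as [n][sizeK[d]][OH][OW] while the host output is laid out
// as [n][K][OH][OW], so each sample n is copied separately into its slot at
// channel offset startK[d].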
for (int d = 0; d < num_devices; d++)
{
if (N <= 0 || sizeK[d] <= 0)
{
continue;
}
CUDA_CALL( d, cudaSetDevice(d) );
for (int n = 0; n < N; n++)
{
CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
mpi_rank, d, &output[(n * K * OH * OW) + (startK[d] * OH * OW)], o_d[d] + n * sizeK[d] * OH * OW,
sizeK[d] * OH * OW * sizeof(float), sizeK[d] * OH * OW * sizeof(float));
CUDA_CALL( d, cudaMemcpyAsync(&output[(n * K * OH * OW) + (startK[d] * OH * OW)],
o_d[d] + n * sizeK[d] * OH * OW,
sizeK[d] * OH * OW * sizeof(float),
cudaMemcpyDeviceToHost) );
}
}
for (int d = 0; d < num_devices; d++)
{
CUDA_CALL( d, cudaSetDevice(d) );
CUDA_CALL( d, cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////
// gather
if (mpi_rank == 0)
{
for (int i = 1; i < mpi_world_size; i++)
{
const int sizeOfN = sizeN[i] * K * OH * OW;
float *outputForNodes = output + (startN[i] * K * OH * OW);
if (sizeOfN == 0)
{
continue;
}
MPI_Recv(outputForNodes, sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
}
else
{
if (N > 0)
{
MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
}
}
if (mpi_rank == 0)
{
// CUDA_CALL( 0, cudaHostUnregister(input) );
// CUDA_CALL( 0, cudaHostUnregister(output) );
// CUDA_CALL( 0, cudaHostUnregister(filter) );
}
}
void convolution_init(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride)
{
C = _C; H = _H; W = _W;
K = _K; R = _R; S = _S;
pad = _pad;
dilation = _dilation;
stride = _stride;
OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
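// startN/sizeN are int arrays but are allocated through the float-typed
// alloc_tensor() helper, which relies on sizeof(int) == sizeof(float).
// When _N <= world_size each rank gets at most one sample; otherwise each of
// the first world_size-1 ranks gets roughly 3/4 of an even share of the batch
// and the last rank takes whatever remains.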
alloc_tensor((float**)&startN, mpi_world_size, 1, 1, 1);
alloc_tensor((float**)&sizeN, mpi_world_size, 1, 1, 1);
int slicedN = (_N <= mpi_world_size) ? 1 : ((_N * 3) / (mpi_world_size * 4));
int offsetN = 0;
for (int i = 0; i < mpi_world_size; i++)
{
startN[i] = offsetN;
sizeN[i] = MIN(slicedN, _N - startN[i]);
offsetN += sizeN[i];
}
sizeN[mpi_world_size - 1] = _N - startN[mpi_world_size - 1];
N = sizeN[mpi_rank];
if (mpi_rank != 0)
{
// Non-root ranks stage their input/output slab and the full filter in
// page-locked host buffers; rank 0 works directly on the caller's tensors.
if (N > 0)
{
cuda_alloc(&input, N, C, H, W);
cuda_alloc(&output, N, K, OH, OW);
}
cuda_alloc(&filter, K, C, R, S);
}
CUDA_LOG("[Node %d] N = %d, _N = %d, sclicedN = %d, startN=%d, sizeN=%d\n",
mpi_rank, N, _N, slicedN, startN[mpi_rank], sizeN[mpi_rank]);
CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
if (num_devices > MAX_NUM_GPU)
{
num_devices = MAX_NUM_GPU;
}
if (num_devices <= 0)
{
printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
exit(1);
}
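// Device buffers are only allocated on ranks that own at least one sample;
// element counts below are rounded up to a multiple of 8 floats via CEIL(),
// presumably to keep allocation sizes alignment-friendly.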
if (N > 0)
{
// Allocate device memory for each GPU
for (int d = 0; d < num_devices; d++)
{
CUDA_CALL( d, cudaSetDevice(d) );
CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) ); // 32 64 256 256
CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) ); // 64 64 16 16
CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
}
for (int d = 0; d < num_devices; d++)
{
CUDA_CALL( d, cudaSetDevice(d) );
CUDA_CALL( d, cudaDeviceSynchronize() );
}
}
}
void convolution_final(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride)
{
// CUDA
if (N > 0)
{
for (int d = 0; d < num_devices; d++)
{
CUDA_CALL( d, cudaSetDevice(d) );
CUDA_CALL( d, cudaDeviceSynchronize() );
}
for (int d = 0; d < num_devices; d++)
{
CUDA_CALL( d, cudaSetDevice(d) );
CUDA_CALL( d, cudaFree(i_d[d]) );
CUDA_CALL( d, cudaFree(f_d[d]) );
CUDA_CALL( d, cudaFree(o_d[d]) );
}
for (int d = 0; d < num_devices; d++)
{
CUDA_CALL( d, cudaSetDevice(d) );
CUDA_CALL( d, cudaDeviceSynchronize() );
}
}
// MPI
MPI_Barrier(MPI_COMM_WORLD);
if (mpi_rank != 0)
{
// Only non-root ranks allocated their own page-locked host buffers in
// convolution_init(); rank 0 used the caller's tensors directly.
if (N > 0)
{
cuda_free(input);
cuda_free(output);
}
cuda_free(filter);
}
free(startN);
free(sizeN);
}