// Origin: chundoong-lab-ta/SamsungDS22/submissions/final/yw0.kim/tmp-B/convolution.cu
// (listing metadata: 472 lines, 13 KiB, plaintext)

#include "convolution.h"
#include "util.h"
#include <mpi.h>
#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>
// Kernel selection: KERNEL_VERSION picks which kernel cuda_kernel_call() launches.
#define NAIVE 0
#define OPTIMIZED 1
// Set to 1 to print tensors and launch geometry while debugging.
#define PRINT_DEBUG 0
// #define KERNEL_VERSION NAIVE
#define KERNEL_VERSION OPTIMIZED
// Evaluates a CUDA runtime call once; on failure prints file, line, numeric
// code and message to stderr and terminates the process with exit(1).
// Wrapped in do { } while (0) so that `CUDA_CALL(x);` is a single statement
// and stays correct inside un-braced if/else bodies.
#define CUDA_CALL(f)                                                           \
    do                                                                         \
    {                                                                          \
        cudaError_t cuda_call_err_ = (f);                                      \
        if (cuda_call_err_ != cudaSuccess)                                     \
        {                                                                      \
            fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__,         \
                    __LINE__, (int)cuda_call_err_,                             \
                    cudaGetErrorString(cuda_call_err_));                       \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
// Capacity of the per-GPU bookkeeping arrays in this file.
#define MAX_NUM_GPU 4
// Thread count passed to omp_set_num_threads() in convolution_init().
#define NUM_THREADS_PER_NODE 40
// Shared-memory filter tile extents used by kernel_convolution:
// channels x filter rows x filter columns.
#define CTILE_SIZE 8
#define RTILE_SIZE 16
#define STILE_SIZE 16
// NOTE: function-like macro, both arguments may be evaluated twice; only pass
// side-effect-free expressions.
#define min(A, B) (((A) > (B)) ? (B) : (A))
/*
 * Reference convolution kernel.
 *
 * Each block handles one output pixel: blockIdx.x selects the output row and
 * blockIdx.y the output column. threadIdx is not consulted, so every thread of
 * a block would compute and store the same values; the launcher in this file
 * uses a 1x1 block. The kernel walks all N images and K output channels
 * serially and accumulates over channels, filter rows and filter columns in
 * that order. Taps that land outside the input (padding) are skipped.
 *
 * Indexing implies input is N x C x H x W, filter is K x C x R x S and output
 * is N x K x OH x OW, all densely packed. mpi_rank is accepted but unused.
 */
__global__ void kernel_convolution_naive(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
    const int out_row = blockIdx.x;
    const int out_col = blockIdx.y;

    // Input coordinate hit by filter tap (0, 0); negative inside the padding.
    const int row_origin = out_row * stride - pad;
    const int col_origin = out_col * stride - pad;

    for (int batch = 0; batch < N; ++batch)
    {
        for (int out_ch = 0; out_ch < K; ++out_ch)
        {
            float acc = 0.0f;
            for (int in_ch = 0; in_ch < C; ++in_ch)
            {
                const float *in_plane = input + (batch * C + in_ch) * H * W;
                const float *taps = filter + (out_ch * C + in_ch) * R * S;
                for (int fr = 0; fr < R; ++fr)
                {
                    const int in_row = row_origin + fr * dilation;
                    if (in_row < 0 || in_row >= H)
                    {
                        // The whole filter row falls outside the image.
                        continue;
                    }
                    for (int fc = 0; fc < S; ++fc)
                    {
                        const int in_col = col_origin + fc * dilation;
                        if (in_col >= 0 && in_col < W)
                        {
                            const float pixel = in_plane[in_row * W + in_col];
                            const float weight = taps[fr * S + fc];
                            acc += pixel * weight;
                        }
                    }
                }
            }
            output[((batch * K + out_ch) * OH + out_row) * OW + out_col] = acc;
        }
    }
}
/*
 * Tiled convolution kernel: one thread per output pixel, one block per
 * (RTILE_SIZE x STILE_SIZE output tile, image n, output channel k).
 *
 * Expected launch geometry:
 *   blockDim = (STILE_SIZE, RTILE_SIZE)
 *   gridDim  = (ceil(OH / RTILE_SIZE), ceil(OW / STILE_SIZE), N * K)
 * blockIdx.z encodes (n, k) as n * K + k.
 *
 * Static shared memory: CTILE_SIZE * RTILE_SIZE * STILE_SIZE floats holding a
 * slab of filter k. The thread block doubles as the loader: thread (lr, ls)
 * fetches filter tap (rtile + lr, stile + ls) for every channel of the current
 * channel tile, so every thread of the block must reach both barriers, even
 * threads whose output pixel is outside OH x OW.
 *
 * Indexing implies input is N x C x H x W, filter is K x C x R x S and output
 * is N x K x OH x OW, all densely packed. mpi_rank is accepted but unused.
 */
__global__ void kernel_convolution(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
    const int lr = threadIdx.y;
    const int ls = threadIdx.x;
    const int oh = blockIdx.x * RTILE_SIZE + lr;
    const int ow = blockIdx.y * STILE_SIZE + ls;
    const int n = blockIdx.z / K;
    const int k = blockIdx.z % K;

    // n depends only on blockIdx, so this exit is taken by the whole block or
    // by none of it and cannot split the barriers below.
    if (n >= N)
    {
        return;
    }

    __shared__ float filter_shared[CTILE_SIZE][RTILE_SIZE][STILE_SIZE];

    // 64-bit element strides: N*C*H*W or K*C*R*S may not fit in a 32-bit int.
    const size_t plane = (size_t)H * W;
    const size_t taps = (size_t)R * S;
    const float *input_n = input + (size_t)n * C * plane;
    const float *filter_k = filter + (size_t)k * C * taps;

    // Input coordinate hit by filter tap (0, 0); negative inside the padding.
    const int h0 = oh * stride - pad;
    const int w0 = ow * stride - pad;

    float o = 0.0f;
    for (int rtile = 0; rtile < R; rtile += RTILE_SIZE)
    {
        // Number of real filter rows in this tile; the rest is zero padding.
        const int rcount = min(RTILE_SIZE, R - rtile);
        for (int stile = 0; stile < S; stile += STILE_SIZE)
        {
            const int scount = min(STILE_SIZE, S - stile);
            for (int ctile = 0; ctile < C; ctile += CTILE_SIZE)
            {
                const int climit = min(ctile + CTILE_SIZE, C);
                const int r = rtile + lr;
                const int s = stile + ls;
                const bool tap_in_filter = (r < R && s < S);

                // Cooperative load of the filter slab for this channel tile.
                for (int c = ctile; c < climit; c++)
                {
                    // ctile is a multiple of CTILE_SIZE, so this equals c % CTILE_SIZE.
                    const int lc = c - ctile;
                    filter_shared[lc][lr][ls] =
                        tap_in_filter ? filter_k[(size_t)c * taps + (size_t)r * S + s] : 0.0f;
                }
                __syncthreads();

                for (int c = ctile; c < climit; c++)
                {
                    const int lc = c - ctile;
                    const float *input_c = input_n + (size_t)c * plane;
                    // Only visit taps that exist in the filter. Sweeping the
                    // zero-padded remainder of the tile wastes work and turns
                    // an Inf/NaN input sample into NaN via 0 * Inf.
                    for (int tr = 0; tr < rcount; tr++)
                    {
                        const int h = h0 + (rtile + tr) * dilation;
                        if (h < 0 || h >= H)
                            continue;
                        for (int ts = 0; ts < scount; ts++)
                        {
                            const int w = w0 + (stile + ts) * dilation;
                            if (w < 0 || w >= W)
                                continue;
                            o += input_c[(size_t)h * W + w] * filter_shared[lc][tr][ts];
                        }
                    }
                }
                // Keep the slab intact until every thread has finished with it.
                __syncthreads();
            }
        }
    }

    if (oh < OH && ow < OW)
    {
        output[(((size_t)n * K + k) * OH + oh) * OW + ow] = o;
    }
}
// Number of CUDA devices, filled in by cuda_device_init() via cudaGetDeviceCount().
static int num_devices = 0;
// Host tensors. On rank 0 these alias the buffers passed to convolution();
// on other ranks they are allocated with alloc_tensor() in convolution_init().
static float *input, *output, *filter;
// Problem shape, copied from the convolution_init() arguments.
static int N, C, H, W;
static int K, R, S;
// Output height/width, derived in convolution_init() from shape, pad, dilation and stride.
static int OH, OW;
static int pad;
static int dilation;
static int stride;
// This process's rank and the communicator size (MPI_COMM_WORLD).
static int mpi_rank, mpi_world_size;
// Batch range [nstart, nend) and its length for this rank. Rank 0 also reuses
// these as scratch while computing the range of rank 1.
static int nstart, nend, nlen;
// Shared MPI request/status objects used by the send/receive calls in this file.
static MPI_Request request;
static MPI_Status status;
// Per-GPU device buffers, allocated in cuda_device_malloc().
static float *input_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
// Per-GPU batch sub-range [Nbegin, Nend) and length, relative to this rank's
// slice; computed in cuda_device_init().
static int Nbegin[MAX_NUM_GPU], Nend[MAX_NUM_GPU], Nlen[MAX_NUM_GPU];
/*
 * Runs the convolution configured by convolution_init().
 *
 * _input, _output, _filter: host tensors; only rank 0 reads these arguments.
 * The remaining arguments are ignored; the shape stored by convolution_init()
 * is used instead.
 *
 * Single-process run: copy to the GPUs, launch, copy back.
 * Multi-process run (the code only talks to rank 1, i.e. it assumes exactly
 * two ranks): rank 0 ships rank 1's batch slice and the filter, both ranks
 * compute their own slice, then rank 0 receives rank 1's output slice into
 * the caller's output buffer.
 */
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
    if (mpi_rank == 0)
    {
        input = _input;
        output = _output;
        filter = _filter;
#if (PRINT_DEBUG == 1)
        print_filter(filter, K, C, R, S);
        print_input(input, nlen, C, H, W);
#endif
    }

    if (mpi_world_size == 1)
    {
        cuda_memcpy_host_to_device();
        cuda_kernel_call();
        cuda_memcpy_device_to_host();
        return;
    }

    const int root = 0;
    const int worker = 1;
    const int tag = 1;

    if (mpi_rank == root)
    {
        // The worker's batch range lives in locals so that this rank's own
        // nstart/nend/nlen (set in convolution_init) are never clobbered.
        const int worker_start = N / mpi_world_size * worker + min(worker, N % mpi_world_size);
        const int worker_end = N / mpi_world_size * (worker + 1) + min(worker + 1, N % mpi_world_size);
        const int worker_len = worker_end - worker_start;

        // One request per send: sharing a single handle loses the first one
        // and leaves both sends without a completion point.
        MPI_Request sends[2];
        MPI_Isend(input + (size_t)worker_start * C * H * W, worker_len * C * H * W,
                  MPI_FLOAT, worker, tag, MPI_COMM_WORLD, &sends[0]);
        MPI_Isend(filter, K * C * R * S, MPI_FLOAT, worker, tag, MPI_COMM_WORLD, &sends[1]);

        // Local work overlaps with the transfers above.
        cuda_memcpy_host_to_device();
        cuda_kernel_call();
        cuda_memcpy_device_to_host();

        // Complete the sends before the caller may reuse input/filter.
        MPI_Waitall(2, sends, MPI_STATUSES_IGNORE);

        MPI_Recv(output + (size_t)worker_start * K * OH * OW, worker_len * K * OH * OW,
                 MPI_FLOAT, worker, tag, MPI_COMM_WORLD, &status);
    }
    else
    {
        // Same source and tag: MPI delivers the two messages in send order.
        MPI_Recv(input, nlen * C * H * W, MPI_FLOAT, root, tag, MPI_COMM_WORLD, &status);
        MPI_Recv(filter, K * C * R * S, MPI_FLOAT, root, tag, MPI_COMM_WORLD, &status);
        zero_tensor(output, nlen, K, OH, OW);

        cuda_memcpy_host_to_device();
        cuda_kernel_call();
        cuda_memcpy_device_to_host();

        // Blocking send: the output buffer is zeroed again on the next call,
        // so the transfer must be finished before this function returns.
        MPI_Send(output, nlen * K * OH * OW, MPI_FLOAT, root, tag, MPI_COMM_WORLD);
    }
}
/*
 * Stores the problem shape, derives the output size, splits the batch across
 * MPI ranks and prepares the GPUs (device discovery + buffer allocation).
 *
 * With one rank the whole batch [0, N) is local. With more than one rank,
 * rank 0 computes the range of rank 1 and sends it; every other rank receives
 * its range from rank 0 and allocates its own host tensors. Rank 0 only ever
 * sends to rank 1, so a run with more than two ranks would leave the extra
 * ranks waiting in MPI_Recv.
 */
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
    N = _N;
    C = _C;
    H = _H;
    W = _W;
    K = _K;
    R = _R;
    S = _S;
    pad = _pad;
    dilation = _dilation;
    stride = _stride;

    omp_set_num_threads(NUM_THREADS_PER_NODE);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

    OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
    OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

    if (mpi_world_size == 1)
    {
        // Single process: the whole batch is local.
        nstart = 0;
        nend = N;
        nlen = nend - nstart;
    }
    else if (mpi_rank == 0)
    {
        const int dest = 1;
        // Dedicated send buffers with blocking sends. Posting a non-blocking
        // send from nstart/nend and then overwriting them with this rank's own
        // range modifies a buffer that MPI may still be reading.
        int dest_start = N / mpi_world_size * dest + min(dest, N % mpi_world_size);
        int dest_end = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size);
        MPI_Send(&dest_start, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
        MPI_Send(&dest_end, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);

        nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
        nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
        nlen = nend - nstart;
    }
    else
    {
        const int source = 0;
        // Start index first, then end index, matching the send order above.
        MPI_Recv(&nstart, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
        MPI_Recv(&nend, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
        nlen = nend - nstart;

        // Non-root ranks own their host tensors; only the local slice is held.
        alloc_tensor(&input, nlen, C, H, W);
        alloc_tensor(&output, nlen, K, OH, OW);
        alloc_tensor(&filter, K, C, R, S);
    }

    cuda_device_init();
    cuda_device_malloc();
}
/*
 * Releases the per-GPU buffers created by cuda_device_malloc().
 * All arguments are ignored. Safe to call when nothing was allocated:
 * the pointer arrays start out null and cudaFree(NULL) is a no-op.
 *
 * NOTE(review): host tensors obtained from alloc_tensor() on non-root ranks
 * are not released here because the matching deallocator is not visible in
 * this file.
 */
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
    for (int i = 0; i < num_devices && i < MAX_NUM_GPU; i++)
    {
        CUDA_CALL(cudaSetDevice(i));
        CUDA_CALL(cudaFree(input_d[i]));
        CUDA_CALL(cudaFree(filter_d[i]));
        CUDA_CALL(cudaFree(output_d[i]));
        // Reset so that a repeated call does not free the same pointer twice.
        input_d[i] = NULL;
        filter_d[i] = NULL;
        output_d[i] = NULL;
    }
}
/*
 * Discovers the CUDA devices and splits this rank's batch slice (nlen images)
 * across them as evenly as possible: the first (nlen % num_devices) devices
 * get one extra image. Exits the process when no device is available.
 */
void cuda_device_init(void)
{
    CUDA_CALL(cudaGetDeviceCount(&num_devices));
    if (num_devices <= 0)
    {
        printf("No CUDA device found. Aborting\n");
        exit(1);
    }
    // The per-GPU arrays hold MAX_NUM_GPU entries; never index past them on
    // nodes that expose more devices.
    if (num_devices > MAX_NUM_GPU)
    {
        num_devices = MAX_NUM_GPU;
    }

    // Setup problem size for each GPU. The loop is trivially cheap, so it runs
    // serially.
    for (int i = 0; i < num_devices; i++)
    {
        Nbegin[i] = (nlen / num_devices) * i + min(i, nlen % num_devices);
        Nend[i] = (nlen / num_devices) * (i + 1) + min(i + 1, nlen % num_devices);
        Nlen[i] = Nend[i] - Nbegin[i];
#if (PRINT_DEBUG == 1)
        printf("Node #%d Device #%d (Nbegin, Nend, Nlen): (%d, %d, %d)\n", mpi_rank, i, Nbegin[i], Nend[i], Nlen[i]);
#endif
    }
}
/*
 * Allocates, on every GPU, room for that GPU's share of the input and output
 * plus a full copy of the filter. Relies on Nlen[] from cuda_device_init().
 */
void cuda_device_malloc(void)
{
    // Sizes are computed in size_t: the element counts can exceed INT_MAX
    // before the multiplication by sizeof(float) widens them.
    const size_t image_in_bytes = (size_t)C * H * W * sizeof(float);
    const size_t image_out_bytes = (size_t)K * OH * OW * sizeof(float);
    const size_t filter_bytes = (size_t)K * C * R * S * sizeof(float);

    for (int i = 0; i < num_devices; i++)
    {
        CUDA_CALL(cudaSetDevice(i));
        CUDA_CALL(cudaMalloc(&input_d[i], (size_t)Nlen[i] * image_in_bytes));
        CUDA_CALL(cudaMalloc(&filter_d[i], filter_bytes));
        CUDA_CALL(cudaMalloc(&output_d[i], (size_t)Nlen[i] * image_out_bytes));
    }
}
/*
 * Uploads each GPU's slice of the host input and a full copy of the filter.
 * One host thread per GPU issues the blocking copies so that the transfers to
 * different devices can proceed concurrently. The current CUDA device is
 * per-thread state, hence the cudaSetDevice() inside the loop body.
 */
void cuda_memcpy_host_to_device(void)
{
    if (num_devices <= 0)
    {
        return;
    }

    // Element counts in size_t so that large tensors do not overflow int.
    const size_t image_elems = (size_t)C * H * W;
    const size_t filter_bytes = (size_t)K * C * R * S * sizeof(float);

#pragma omp parallel for num_threads(num_devices)
    for (int i = 0; i < num_devices; i++)
    {
        CUDA_CALL(cudaSetDevice(i));
        CUDA_CALL(cudaMemcpy(input_d[i],
                             input + (size_t)Nbegin[i] * image_elems,
                             (size_t)Nlen[i] * image_elems * sizeof(float),
                             cudaMemcpyHostToDevice));
        CUDA_CALL(cudaMemcpy(filter_d[i],
                             filter,
                             filter_bytes,
                             cudaMemcpyHostToDevice));
    }
}
/*
 * Downloads each GPU's slice of the output into the host output tensor.
 * One host thread per GPU issues the copy. cudaMemcpy to host memory returns
 * only after the data has arrived, so no extra device synchronization is
 * needed once the loop has finished.
 */
void cuda_memcpy_device_to_host(void)
{
    if (num_devices <= 0)
    {
        return;
    }

    // Element count in size_t so that large tensors do not overflow int.
    const size_t image_elems = (size_t)K * OH * OW;

#pragma omp parallel for num_threads(num_devices)
    for (int i = 0; i < num_devices; i++)
    {
        CUDA_CALL(cudaSetDevice(i));
        CUDA_CALL(cudaMemcpy(output + (size_t)Nbegin[i] * image_elems,
                             output_d[i],
                             (size_t)Nlen[i] * image_elems * sizeof(float),
                             cudaMemcpyDeviceToHost));
    }
}
/*
 * Launches the selected convolution kernel on every GPU for that GPU's share
 * of the batch, then waits for all devices to finish.
 *
 * Launches are asynchronous, so a plain serial loop already keeps all GPUs
 * busy at once. Each launch is followed by cudaGetLastError() so that an
 * invalid launch configuration aborts with a message instead of silently
 * leaving the output untouched.
 */
void cuda_kernel_call(void)
{
    for (int i = 0; i < num_devices; i++)
    {
        // Nothing to compute on this device (e.g. fewer images than GPUs, or
        // an empty output); a zero-sized grid would be an invalid launch.
        if (Nlen[i] <= 0 || K <= 0 || OH <= 0 || OW <= 0)
        {
            continue;
        }
        CUDA_CALL(cudaSetDevice(i));
#if (KERNEL_VERSION == NAIVE)
        // Naive Version: one single-thread block per output pixel.
        dim3 block(1, 1);
        dim3 grid(OH, OW);
#if (PRINT_DEBUG == 1)
        printf("Device #%d (blockDim.x, blockDim.y): (%d, %d)\n", i, block.x, block.y);
        printf("Device #%d (gridDim.x, gridDim.y, gridDim.z): (%d, %d, %d)\n", i, grid.x, grid.y, grid.z);
#endif
        kernel_convolution_naive<<<grid, block>>>(mpi_rank,
                                                  input_d[i], output_d[i], filter_d[i],
                                                  Nlen[i], C, H, W,
                                                  K, R, S, OH, OW,
                                                  pad, dilation, stride);
        CUDA_CALL(cudaGetLastError());
#elif (KERNEL_VERSION == OPTIMIZED)
        // Optimized Version: gridDim.z carries (image, output channel) and is
        // limited to 65535 blocks, so the batch is processed in chunks of
        // images that fit. Each chunk gets pointers offset to its first image
        // and its own image count, which keeps the kernel's indexing valid.
        const int max_grid_z = 65535;
        int images_per_launch = max_grid_z / K;
        if (images_per_launch < 1)
        {
            // K alone exceeds the limit; the launch below reports the error.
            images_per_launch = 1;
        }
        dim3 block(STILE_SIZE, RTILE_SIZE);
        for (int first = 0; first < Nlen[i]; first += images_per_launch)
        {
            const int count = min(images_per_launch, Nlen[i] - first);
            dim3 grid((OH + RTILE_SIZE - 1) / RTILE_SIZE, (OW + STILE_SIZE - 1) / STILE_SIZE, count * K);
#if (PRINT_DEBUG == 1)
            printf("Device #%d (blockDim.x, blockDim.y): (%d, %d)\n", i, block.x, block.y);
            printf("Device #%d (gridDim.x, gridDim.y, gridDim.z): (%d, %d, %d)\n", i, grid.x, grid.y, grid.z);
#endif
            kernel_convolution<<<grid, block>>>(mpi_rank,
                                                input_d[i] + (size_t)first * C * H * W,
                                                output_d[i] + (size_t)first * K * OH * OW,
                                                filter_d[i],
                                                count, C, H, W,
                                                K, R, S, OH, OW,
                                                pad, dilation, stride);
            CUDA_CALL(cudaGetLastError());
        }
#endif
    }

    // Wait for every device; errors raised while a kernel runs surface here.
    for (int i = 0; i < num_devices; i++)
    {
        CUDA_CALL(cudaSetDevice(i));
        CUDA_CALL(cudaDeviceSynchronize());
    }
}
/*
 * Debug helper: dumps a densely packed K x C x R x S filter to stdout,
 * one R x S matrix per (k, c) pair.
 */
void print_filter(float *filter, int K, int C, int R, int S)
{
    printf("--- FILTER (K, C, R, S): (%d, %d, %d, %d) --- \n", K, C, R, S);

    // The tensor is visited in storage order, so a running pointer replaces
    // the explicit four-term index.
    const float *cursor = filter;
    for (int out_ch = 0; out_ch < K; ++out_ch)
    {
        for (int in_ch = 0; in_ch < C; ++in_ch)
        {
            printf("(k, c): (%d, %d)\n", out_ch, in_ch);
            for (int row = 0; row < R; ++row)
            {
                for (int col = 0; col < S; ++col)
                {
                    printf("%f ", *cursor);
                    ++cursor;
                }
                printf("\n");
            }
            printf("\n\n");
        }
    }
}
/*
 * Debug helper: dumps a densely packed N x C x H x W input tensor to stdout,
 * one H x W matrix per (n, c) pair.
 */
void print_input(float *input, int N, int C, int H, int W)
{
    // The second dimension is the channel count C passed by the caller, not
    // the file-scope filter count K.
    printf("--- INPUT (N, C, H, W): (%d, %d, %d, %d) --- \n", N, C, H, W);
    for (int n = 0; n < N; n++)
    {
        for (int c = 0; c < C; c++)
        {
            printf("(n, c): (%d, %d)\n", n, c);
            for (int h = 0; h < H; h++)
            {
                for (int w = 0; w < W; w++)
                {
                    printf("%f ", input[n * C * H * W + c * H * W + h * W + w]);
                }
                printf("\n");
            }
            printf("\n\n");
        }
    }
}