#include "convolution.h"
#include "util.h"
#include <mpi.h>
#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>

#define NAIVE 0
#define OPTIMIZED 1

#define PRINT_DEBUG 0
// #define KERNEL_VERSION NAIVE
#define KERNEL_VERSION OPTIMIZED

// Abort with a readable file/line message whenever a CUDA runtime call fails.
#define CUDA_CALL(f)                                                          \
  {                                                                           \
    cudaError_t err = (f);                                                    \
    if (err != cudaSuccess)                                                   \
    {                                                                         \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,    \
              err, cudaGetErrorString(err));                                  \
      exit(1);                                                                \
    }                                                                         \
  }
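
// Usage example for CUDA_CALL (illustrative, not from the original source):
// wrap any CUDA runtime call that returns cudaError_t, e.g.
//   float *buf;
//   CUDA_CALL(cudaMalloc(&buf, 1024 * sizeof(float)));
// On failure this prints the failing file/line and error string, then exits.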

#define MAX_NUM_GPU 4
#define NUM_THREADS_PER_NODE 40
#define CTILE_SIZE 8
#define RTILE_SIZE 16
#define STILE_SIZE 16

#define min(A, B) (((A) > (B)) ? (B) : (A))

// Naive kernel: one single-thread block per output pixel (oh, ow); that
// thread loops over every batch sample n, output channel k, and filter
// tap (c, r, s).
__global__ void kernel_convolution_naive(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  int oh = blockIdx.x;
  int ow = blockIdx.y;

  for (int n = 0; n < N; n++)
  {
    for (int k = 0; k < K; k++)
    {
      float o = 0.0f;
      for (int c = 0; c < C; c++)
      {
        for (int r = 0; r < R; r++)
        {
          for (int s = 0; s < S; s++)
          {
            int h = oh * stride - pad + r * dilation;
            int w = ow * stride - pad + s * dilation;
            if (h < 0 || h >= H || w < 0 || w >= W)
              continue;
            float i = input[n * C * H * W + c * H * W + h * W + w];
            float f = filter[k * C * R * S + c * R * S + r * S + s];
            o += i * f;
          }
        }
      }

      output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
    }
  }
}

// Tiled kernel: each thread block computes an RTILE_SIZE x STILE_SIZE tile of
// output pixels for one (n, k) pair (packed into blockIdx.z). The filter is
// staged through shared memory in CTILE_SIZE-channel slices so every thread
// in the block reuses the same filter taps.
__global__ void kernel_convolution(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  int lr = threadIdx.y;
  int ls = threadIdx.x;
  int oh = blockIdx.x * RTILE_SIZE + lr;
  int ow = blockIdx.y * STILE_SIZE + ls;
  int n = blockIdx.z / K;
  int k = blockIdx.z % K;

  __shared__ float filter_shared[CTILE_SIZE][RTILE_SIZE][STILE_SIZE];

  float o = 0.0f;
  for (int rtile = 0; rtile < R; rtile += RTILE_SIZE)
  {
    for (int stile = 0; stile < S; stile += STILE_SIZE)
    {
      for (int ctile = 0; ctile < C; ctile += CTILE_SIZE)
      {
        // Stage this (rtile, stile, ctile) slice of the filter; taps that
        // fall outside R x S are zero-filled so the accumulation loop below
        // needs no extra bounds checks on r and s.
        int r = rtile + lr;
        int s = stile + ls;
        int climit = min(ctile + CTILE_SIZE, C);
        for (int c = ctile; c < climit; c++)
        {
          int lc = c % CTILE_SIZE;
          if (r < R && s < S)
          {
            filter_shared[lc][lr][ls] = filter[k * C * R * S + c * R * S + r * S + s];
          }
          else
          {
            filter_shared[lc][lr][ls] = 0.0f;
          }
        }
        __syncthreads();

        for (int c = ctile; c < climit; c++)
        {
          for (int r = 0; r < RTILE_SIZE; r++)
          {
            for (int s = 0; s < STILE_SIZE; s++)
            {
              int lc = c % CTILE_SIZE;
              int h = oh * stride - pad + (rtile + r) * dilation;
              int w = ow * stride - pad + (stile + s) * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W)
                continue;
              float i = input[n * C * H * W + c * H * W + h * W + w];
              float f = filter_shared[lc][r][s];
              o += i * f;
            }
          }
        }
        __syncthreads();
      }
    }
  }

  // Threads past the right/bottom edge of the output still participated in
  // the shared-memory loads above; only in-range threads write a result
  // (oh and ow are nonnegative by construction).
  if (oh < OH && ow < OW)
  {
    output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
  }
}
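
// Illustrative shared-memory sizing for the tile constants above (my
// arithmetic, not from the original source): filter_shared holds
// CTILE_SIZE * RTILE_SIZE * STILE_SIZE = 8 * 16 * 16 = 2048 floats,
// i.e. 8 KiB per block, well under the 48 KiB static shared-memory limit
// of current CUDA devices.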

static int num_devices = 0;
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int nstart, nend, nlen;

static MPI_Request request;
static MPI_Status status;

static float *input_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
static int Nbegin[MAX_NUM_GPU], Nend[MAX_NUM_GPU], Nlen[MAX_NUM_GPU];

void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;

#if (PRINT_DEBUG == 1)
    print_filter(filter, K, C, R, S);
    print_input(input, nlen, C, H, W);
#endif
  }

  if (mpi_world_size == 1)
  {
    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();
  }
  else
  {
    if (mpi_rank == 0)
    {
      // Block-distribute the batch dimension: compute node #1's slice, ship
      // it the input slice and the full filter, then recompute this rank's
      // own slice.
      int dest = 1;
      nstart = N / mpi_world_size * dest + min(dest, N % mpi_world_size);
      nend = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size);
      nlen = nend - nstart;
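      // Worked example of this block distribution (illustrative numbers):
      // with N = 7 and mpi_world_size = 2, N / 2 = 3 and N % 2 = 1, so
      // rank 0 gets [0, 4) (4 samples) and rank 1 gets [4, 7) (3 samples);
      // the remainder is spread over the leading ranks.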

      // The same request handle is reused without an MPI_Wait, which leaks
      // the request objects; the transfers still complete because the
      // matching receives drive them, and neither buffer is modified again.
      MPI_Isend(&input[nstart * C * H * W], nlen * C * H * W, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
      MPI_Isend(filter, K * C * R * S, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);

      nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
      nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
      nlen = nend - nstart;
    }
    else
    {
      int source = 0;
      MPI_Recv(input, nlen * C * H * W, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(filter, K * C * R * S, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      zero_tensor(output, nlen, K, OH, OW);
    }

    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();

    if (mpi_rank == 0)
    {
      // Gather node #1's slice of the output back into the full tensor.
      int source = 1;
      nstart = N / mpi_world_size * source + min(source, N % mpi_world_size);
      nend = N / mpi_world_size * (source + 1) + min(source + 1, N % mpi_world_size);
      nlen = nend - nstart;
      MPI_Recv(&output[nstart * K * OH * OW], nlen * K * OH * OW, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
    }
    else
    {
      int dest = 0;
      MPI_Isend(output, nlen * K * OH * OW, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
    }
  }
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  N = _N;
  C = _C;
  H = _H;
  W = _W;
  K = _K;
  R = _R;
  S = _S;

  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  omp_set_num_threads(NUM_THREADS_PER_NODE);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // Standard convolution output size with padding, dilation, and stride.
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
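
  // Quick sanity check of the formula (illustrative numbers): H = W = 64,
  // R = S = 3, pad = 1, dilation = 1, stride = 1 gives
  // OH = (64 + 2 - 2 - 1) / 1 + 1 = 64, i.e. a "same"-size output.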

  if (mpi_world_size == 1)
  {
    // Only one node: rank 0 owns the whole batch.
    nstart = 0;
    nend = N;
    nlen = nend - nstart;
  }
  else
  {
    // This path assumes exactly two nodes (rank 0 and rank 1).
    if (mpi_rank == 0)
    {
      int dest = 1;
      nstart = N / mpi_world_size * dest + min(dest, N % mpi_world_size);
      nend = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size);

      // Send node #1 its start and end index. These two ints are overwritten
      // just below, so blocking sends are required here (an MPI_Isend could
      // still be reading the buffers after they change).
      MPI_Send(&nstart, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
      MPI_Send(&nend, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);

      nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
      nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
      nlen = nend - nstart;
    }
    else
    {
      int source = 0;
      // Receive start and end index from node #0.
      MPI_Recv(&nstart, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(&nend, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      nlen = nend - nstart;

      // Non-root nodes own no caller buffers, so allocate local staging
      // tensors for their slice of the problem.
      alloc_tensor(&input, nlen, C, H, W);
      alloc_tensor(&output, nlen, K, OH, OW);
      alloc_tensor(&filter, K, C, R, S);
    }
  }

  cuda_device_init();
  cuda_device_malloc();
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // Nothing to tear down: host and device buffers are left allocated and
  // reclaimed at process exit.
}

void cuda_device_init(void)
{
  CUDA_CALL(cudaGetDeviceCount(&num_devices));

  if (num_devices <= 0)
  {
    fprintf(stderr, "No CUDA device found. Aborting\n");
    exit(1);
  }

  // Block-distribute this node's batch slice (nlen samples) across the GPUs.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    Nbegin[i] = (nlen / num_devices) * i + min(i, nlen % num_devices);
    Nend[i] = (nlen / num_devices) * (i + 1) + min(i + 1, nlen % num_devices);
    Nlen[i] = Nend[i] - Nbegin[i];
#if (PRINT_DEBUG == 1)
    printf("Node #%d Device #%d (Nbegin, Nend, Nlen): (%d, %d, %d)\n", mpi_rank, i, Nbegin[i], Nend[i], Nlen[i]);
#endif
  }
}

void cuda_device_malloc(void)
{
  // One OpenMP thread per GPU; cudaSetDevice is per-thread, so each thread
  // can allocate on its own device concurrently.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMalloc(&input_d[i], Nlen[i] * C * H * W * sizeof(float)));
    CUDA_CALL(cudaMalloc(&filter_d[i], K * C * R * S * sizeof(float)));
    CUDA_CALL(cudaMalloc(&output_d[i], Nlen[i] * K * OH * OW * sizeof(float)));
  }
}

void cuda_memcpy_host_to_device(void)
{
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(input_d[i],
                         input + Nbegin[i] * C * H * W,
                         Nlen[i] * C * H * W * sizeof(float),
                         cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(filter_d[i],
                         filter,
                         K * C * R * S * sizeof(float),
                         cudaMemcpyHostToDevice));
  }
}

void cuda_memcpy_device_to_host(void)
{
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(output + Nbegin[i] * K * OH * OW,
                         output_d[i],
                         Nlen[i] * K * OH * OW * sizeof(float),
                         cudaMemcpyDeviceToHost));
  }

  // cudaMemcpy is already synchronous, but an explicit sync per device makes
  // the completion point unmistakable.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

void cuda_kernel_call(void)
{
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
#if (KERNEL_VERSION == NAIVE)
    // Naive version: one single-thread block per output pixel.
    dim3 blockDim(1, 1);
    dim3 gridDim(OH, OW);
#elif (KERNEL_VERSION == OPTIMIZED)
    // Optimized version: 16x16 thread tiles over the output plane, with one
    // grid slice in z per (n, k) pair; grid extents are rounded up so partial
    // tiles at the edges are still covered.
    dim3 blockDim(STILE_SIZE, RTILE_SIZE);
    dim3 gridDim((OH + RTILE_SIZE - 1) / RTILE_SIZE, (OW + STILE_SIZE - 1) / STILE_SIZE, Nlen[i] * K);
#endif
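
    // Example geometry (illustrative numbers, not from the original source):
    // OH = OW = 30 with RTILE_SIZE = STILE_SIZE = 16 rounds up to a 2 x 2
    // grid of tiles, so with Nlen[i] = 4 and K = 8 the grid is (2, 2, 32)
    // blocks of 16 x 16 = 256 threads each.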

#if (PRINT_DEBUG == 1)
    printf("Device #%d (blockDim.x, blockDim.y): (%d, %d)\n", i, blockDim.x, blockDim.y);
    printf("Device #%d (gridDim.x, gridDim.y, gridDim.z): (%d, %d, %d)\n", i, gridDim.x, gridDim.y, gridDim.z);
#endif

    CUDA_CALL(cudaSetDevice(i));
#if (KERNEL_VERSION == NAIVE)
    kernel_convolution_naive<<<gridDim, blockDim>>>(mpi_rank,
                                                    input_d[i], output_d[i], filter_d[i],
                                                    Nlen[i], C, H, W,
                                                    K, R, S, OH, OW,
                                                    pad, dilation, stride);
#elif (KERNEL_VERSION == OPTIMIZED)
    kernel_convolution<<<gridDim, blockDim>>>(mpi_rank,
                                              input_d[i], output_d[i], filter_d[i],
                                              Nlen[i], C, H, W,
                                              K, R, S, OH, OW,
                                              pad, dilation, stride);
#endif
  }

  // Wait for every device to finish before the host touches the results.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

void print_filter(float *filter, int K, int C, int R, int S)
{
  printf("--- FILTER (K, C, R, S): (%d, %d, %d, %d) ---\n", K, C, R, S);
  for (int k = 0; k < K; k++)
  {
    for (int c = 0; c < C; c++)
    {
      printf("(k, c): (%d, %d)\n", k, c);
      for (int r = 0; r < R; r++)
      {
        for (int s = 0; s < S; s++)
        {
          printf("%f ", filter[k * C * R * S + c * R * S + r * S + s]);
        }
        printf("\n");
      }
      printf("\n\n");
    }
  }
}

void print_input(float *input, int N, int C, int H, int W)
{
  printf("--- INPUT (N, C, H, W): (%d, %d, %d, %d) ---\n", N, C, H, W);
  for (int n = 0; n < N; n++)
  {
    for (int c = 0; c < C; c++)
    {
      printf("(n, c): (%d, %d)\n", n, c);
      for (int h = 0; h < H; h++)
      {
        for (int w = 0; w < W; w++)
        {
          printf("%f ", input[n * C * H * W + c * H * W + h * W + w]);
        }
        printf("\n");
      }
      printf("\n\n");
    }
  }
}
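
// --- Illustrative extra (not part of the original file) ---------------------
// A minimal single-threaded CPU reference, useful for spot-checking kernel
// output on small shapes. It mirrors the NCHW / KCRS indexing used above; the
// function name and the idea of a host-side check are assumptions, not
// something the original source provides.
void convolution_cpu_reference(
    const float *input, float *output, const float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  for (int n = 0; n < N; n++)
    for (int k = 0; k < K; k++)
      for (int oh = 0; oh < OH; oh++)
        for (int ow = 0; ow < OW; ow++)
        {
          float o = 0.0f;
          for (int c = 0; c < C; c++)
            for (int r = 0; r < R; r++)
              for (int s = 0; s < S; s++)
              {
                int h = oh * stride - pad + r * dilation;
                int w = ow * stride - pad + s * dilation;
                if (h < 0 || h >= H || w < 0 || w >= W)
                  continue;
                o += input[n * C * H * W + c * H * W + h * W + w] *
                     filter[k * C * R * S + c * R * S + r * S + s];
              }
          output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
        }
}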