chundoong-lab-ta/SamsungDS22/submissions/final/km.hero.lee/B/convolution.cu

#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "util.h"
#define CUDA_CALL(f) \
  { \
    cudaError_t err = (f); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              err, cudaGetErrorString(err)); \
      exit(1); \
    } \
  }
#define MAX_NUM_NODE 2
#define MAX_NUM_GPU 4
#define SGEMM_BSIZE 32
#define IM2COL_NTHREADS 1024
/* Problem configuration and MPI / work-distribution state */
static int num_devices;
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad, dilation, stride;
static int mpi_rank, mpi_world_size;
static int wbegin[MAX_NUM_NODE], wend[MAX_NUM_NODE];
static int mbegin[MAX_NUM_GPU], mend[MAX_NUM_GPU];
static int rounded_M, rounded_N;
/* Variables for GPU devices: per-device buffers and streams */
static float *h_input[MAX_NUM_GPU];
static float *h_output[MAX_NUM_GPU];
static float *d_input[MAX_NUM_GPU];
static float *d_filter[MAX_NUM_GPU];
static float *d_col[MAX_NUM_GPU];
static float *d_output[MAX_NUM_GPU];
static cudaStream_t stream[MAX_NUM_GPU]; // one CUDA stream per device
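/*
 * Expected call sequence from the driver (a sketch; the actual harness may
 * differ):
 *
 *   convolution_init(N, C, H, W, K, R, S, pad, dilation, stride);
 *   convolution(input, output, filter, N, C, H, W, K, R, S, pad, dilation, stride);
 *   convolution_final(N, C, H, W, K, R, S, pad, dilation, stride);
 *
 * MPI_Init / MPI_Finalize are assumed to be handled by the caller.
 */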
/* GPU kernels */
__global__ void gpu_im2col(const int n, const float *data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad, const int stride, const int dilation, const int height_col,
    const int width_col, float *data_col);
__global__ void gpu_sgemm(float *A, float *B, float *C, int M, int N, int K);
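
/*
 * Run this node's share of the batch on its GPUs: stage inputs into pinned
 * host buffers, then for each sample issue H2D copy -> im2col -> sgemm ->
 * D2H copy on the device's stream, and finally copy the per-device results
 * into the node-local output buffer.
 */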
static void convolution_gpu (void)
{
  int n;
  int im2col_kernels = C * OH * OW;
  int im2col_blocks = (im2col_kernels + IM2COL_NTHREADS - 1) / IM2COL_NTHREADS;
  dim3 sgemm_blockDim(SGEMM_BSIZE, SGEMM_BSIZE);
  dim3 sgemm_gridDim(rounded_N / SGEMM_BSIZE, rounded_M / SGEMM_BSIZE);

  /* Stage each device's share of the input into its pinned host buffer and
   * upload the (shared) filter once per device. */
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(h_input[i],
                          input + mbegin[i] * C * H * W,
                          (mend[i] - mbegin[i]) * C * H * W * sizeof(float),
                          cudaMemcpyHostToHost) );
    CUDA_CALL( cudaMemcpy(d_filter[i], filter, K * C * R * S * sizeof(float),
                          cudaMemcpyHostToDevice) );
  }
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
  /* Round-robin over samples: every device processes its n-th sample, with
   * transfers and kernels overlapping on the per-device streams. */
  for (n = 0; n < (mend[0] - mbegin[0]); n++) {
    for (int i = 0; i < num_devices; i++) {
      if (n >= (mend[i] - mbegin[i])) {
        continue;
      }
      CUDA_CALL( cudaSetDevice(i) );
      /* H2D: one input image */
      CUDA_CALL( cudaMemcpyAsync(d_input[i],
                                 h_input[i] + n * C * H * W,
                                 C * H * W * sizeof(float),
                                 cudaMemcpyHostToDevice, stream[i]) );
      /* Unfold the image into a (C*R*S) x (OH*OW) column matrix */
      gpu_im2col<<<im2col_blocks, IM2COL_NTHREADS, 0, stream[i]>>>(
          im2col_kernels,
          d_input[i],
          H, W, R, S,
          pad, stride, dilation, OH, OW, d_col[i]);
      /* (K x C*R*S) x (C*R*S x OH*OW) = (K x OH*OW) */
      gpu_sgemm<<<sgemm_gridDim, sgemm_blockDim, 0, stream[i]>>>(
          d_filter[i], d_col[i],
          d_output[i],
          K, OH * OW, R * S * C);
      /* D2H: one output image */
      CUDA_CALL( cudaMemcpyAsync(h_output[i] + n * K * OH * OW,
                                 d_output[i],
                                 K * OH * OW * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream[i]) );
    }
  }
  /* The last device may hold the remainder of this node's batch; finish its
   * extra samples, continuing from the value of n where the loop above
   * stopped. */
  int d = num_devices - 1;
  for (; n < (mend[d] - mbegin[d]); n++) {
    CUDA_CALL( cudaSetDevice(d) );
    CUDA_CALL( cudaMemcpyAsync(d_input[d],
                               h_input[d] + n * C * H * W,
                               C * H * W * sizeof(float),
                               cudaMemcpyHostToDevice, stream[d]) );
    gpu_im2col<<<im2col_blocks, IM2COL_NTHREADS, 0, stream[d]>>>(
        im2col_kernels,
        d_input[d],
        H, W, R, S,
        pad, stride, dilation, OH, OW, d_col[d]);
    gpu_sgemm<<<sgemm_gridDim, sgemm_blockDim, 0, stream[d]>>>(
        d_filter[d], d_col[d],
        d_output[d],
        K, OH * OW, R * S * C);
    CUDA_CALL( cudaMemcpyAsync(h_output[d] + n * K * OH * OW,
                               d_output[d],
                               K * OH * OW * sizeof(float),
                               cudaMemcpyDeviceToHost, stream[d]) );
  }
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
  /* Copy each device's results from its pinned buffer into this node's
   * portion of the output tensor. */
  for (int i = 0; i < num_devices; i++) {
    memcpy(output + mbegin[i] * K * OH * OW,
           h_output[i],
           (mend[i] - mbegin[i]) * K * OH * OW * sizeof(float));
  }
}
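
/*
 * Entry point called on every rank: scatter the input batch from rank 0,
 * broadcast the filter, compute the local share on the GPUs, and gather the
 * outputs back to rank 0.
 */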
void convolution (float *_input, float *_output, float *_filter,
                  int _N, int _C, int _H, int _W,
                  int _K, int _R, int _S,
                  int _pad, int _dilation, int _stride)
{
  input = _input;
  output = _output;
  filter = _filter;
  /* Non-root ranks allocate their own tensors to receive their share of the
   * work. */
  if (mpi_rank != 0) {
    alloc_tensor(&input, N, C, H, W);
    alloc_tensor(&filter, K, C, R, S);
    alloc_tensor(&output, N, K, OH, OW);
  }
  /* Scatter the input batch across nodes */
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Send(input + wbegin[i] * C * H * W, (wend[i] - wbegin[i]) * C * H * W,
               MPI_FLOAT, i, 0, MPI_COMM_WORLD);
    }
  } else {
    MPI_Recv(input, (wend[mpi_rank] - wbegin[mpi_rank]) * C * H * W,
             MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }
  /* Broadcast the filter to all nodes */
  MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
  /* Computation */
  if (wend[mpi_rank] - wbegin[mpi_rank] > 0) {
    convolution_gpu();
  }
  /* Gather the outputs back to rank 0 */
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Recv(output + wbegin[i] * K * OH * OW, (wend[i] - wbegin[i]) * K * OH * OW,
               MPI_FLOAT, i, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  } else {
    MPI_Send(output,
             (wend[mpi_rank] - wbegin[mpi_rank]) * K * OH * OW,
             MPI_FLOAT, 0, 1, MPI_COMM_WORLD);
  }
}
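
/*
 * One-time setup: compute output sizes, split the batch across nodes and
 * devices, and allocate pinned host buffers, device buffers, and one stream
 * per device. Filter/output buffers are padded to multiples of SGEMM_BSIZE
 * so the GEMM grid needs no partial tiles in the M and N directions.
 */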
void convolution_init (
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S, int _pad, int _dilation, int _stride)
{
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
  /* GEMM dimensions rounded up to multiples of the tile size */
  rounded_M = (K + SGEMM_BSIZE - 1) / SGEMM_BSIZE * SGEMM_BSIZE;
  rounded_N = ((OH * OW) + SGEMM_BSIZE - 1) / SGEMM_BSIZE * SGEMM_BSIZE;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
  if (mpi_rank == 0) {
    printf("Using %d compute nodes\n", mpi_world_size);
  }

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  printf("Node %d: Using %d devices\n", mpi_rank, num_devices);
  if (num_devices <= 0) {
    printf("No CUDA device. Aborting\n");
    exit(1);
  }
  /* The static per-device arrays hold at most MAX_NUM_GPU entries */
  if (num_devices > MAX_NUM_GPU) {
    num_devices = MAX_NUM_GPU;
  }

  /* Distribute work (batch dimension) across nodes; the last node takes the
   * remainder. */
  for (int i = 0; i < mpi_world_size; i++) {
    wbegin[i] = N / mpi_world_size * i;
    wend[i] = N / mpi_world_size * (i + 1);
  }
  wend[mpi_world_size - 1] = N;

  /* Distribute this node's share across its devices; the last device takes
   * the remainder. */
  for (int i = 0; i < num_devices; i++) {
    mbegin[i] = ((wend[mpi_rank] - wbegin[mpi_rank]) / num_devices) * i;
    mend[i] = ((wend[mpi_rank] - wbegin[mpi_rank]) / num_devices) * (i + 1);
  }
  mend[num_devices - 1] = (wend[mpi_rank] - wbegin[mpi_rank]);

  /* Allocate pinned host staging buffers and device buffers. The filter and
   * output buffers are padded to the rounded GEMM dimensions and zeroed. */
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaStreamCreate(&stream[i]) );
    CUDA_CALL( cudaMallocHost(&h_input[i], (mend[i] - mbegin[i]) * C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMallocHost(&h_output[i], (mend[i] - mbegin[i]) * K * OH * OW * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_input[i], C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_filter[i], rounded_M * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_output[i], rounded_M * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_col[i], R * S * C * OH * OW * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_filter[i], 0, rounded_M * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_output[i], 0, rounded_M * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_col[i], 0, R * S * C * OH * OW * sizeof(float)) );
  }
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}
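
/* Release all per-device resources allocated in convolution_init. */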
void convolution_final (
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S, int _pad, int _dilation, int _stride)
{
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaFree(d_input[i]) );
    CUDA_CALL( cudaFree(d_filter[i]) );
    CUDA_CALL( cudaFree(d_output[i]) );
    CUDA_CALL( cudaFree(d_col[i]) );
    CUDA_CALL( cudaFreeHost(h_input[i]) );
    CUDA_CALL( cudaFreeHost(h_output[i]) );
    CUDA_CALL( cudaStreamDestroy(stream[i]) );
  }
}
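
/*
 * im2col: unfold one C x H x W image into a (C*R*S) x (OH*OW) column matrix
 * so that convolution reduces to a single matrix multiplication. Each thread
 * handles one (channel, output position) pair and writes the R*S patch
 * values for that position, inserting zeros where the patch falls into the
 * padding region.
 */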
__global__ void gpu_im2col(const int n, const float *data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad, const int stride, const int dilation, const int height_col,
    const int width_col, float *data_col)
{
  /* Grid-stride loop over the C * OH * OW (channel, output-pixel) pairs */
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < n;
       index += blockDim.x * gridDim.x) {
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    const int c_col = c_im * kernel_h * kernel_w;
    const int h_offset = h_col * stride - pad;
    const int w_offset = w_col * stride - pad;
    float *data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float *data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation;
        int w_im = w_offset + j * dilation;
        /* Positions outside the image (padding) are written as zero */
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * dilation * width + j * dilation] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
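
/*
 * Tiled SGEMM: C[M x N] = A[M x K] * B[K x N], with one SGEMM_BSIZE x
 * SGEMM_BSIZE thread block per output tile and shared-memory tiling over the
 * K dimension. Here A is the filter matrix (K_filters x C*R*S), B is the
 * im2col matrix, and C is one output image (K_filters x OH*OW).
 */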
__global__ void gpu_sgemm(float *A, float *B, float *C,
    int M, int N, int K)
{
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  /* Walk over the K dimension in SGEMM_BSIZE-wide tiles of A and B */
  int aBegin = K * SGEMM_BSIZE * by;
  int aEnd = aBegin + K - 1;
  int aStep = SGEMM_BSIZE;
  int bBegin = SGEMM_BSIZE * bx;
  int bStep = SGEMM_BSIZE * N;
  float Csub = 0;
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    __shared__ float As[SGEMM_BSIZE][SGEMM_BSIZE];
    __shared__ float Bs[SGEMM_BSIZE][SGEMM_BSIZE];
    /* Tile loads are not bounds-checked: rows beyond M read the zero-padded
     * filter buffer, and there is no partial-tile handling along K, so the
     * reduction dimension (C*R*S) is assumed to fill whole tiles. */
    As[ty][tx] = A[a + K * ty + tx];
    Bs[ty][tx] = B[b + N * ty + tx];
    __syncthreads();
    #pragma unroll
    for (int k = 0; k < SGEMM_BSIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }
    __syncthreads();
  }
  /* The grid is sized on the rounded dimensions; only write columns that lie
   * inside the real output width. */
  if (bx * SGEMM_BSIZE + tx < N) {
    int c = N * SGEMM_BSIZE * by + SGEMM_BSIZE * bx;
    C[c + N * ty + tx] = Csub;
  }
}