// chundoong-lab-ta/SamsungDS22/submissions/final/seongje.jang/B/convolution.cu

#include "util.h"
#include <mpi.h>
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int filter_size, im_size, col_size, res_size;
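// Per-tensor element counts (set in convolution_init): filter_size = K*C*R*S,
// im_size = C*H*W (one input image), col_size = C*R*S*OH*OW (im2col buffer),
// res_size = K*OH*OW (one output image)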
#define MAX_NODES 2
static int ns[MAX_NODES], ne[MAX_NODES];
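// ns[i]/ne[i]: [start, end) range of batch images assigned to MPI rank i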
#define CUDA_CALL(f) \
{ \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
err, cudaGetErrorString(err)); \
exit(1); \
} \
}
#define TS (32)
#define WPT (16)
#define RTS (2)
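// Tiling parameters for sgemm: TS x TS tiles staged in shared memory; each thread
// accumulates WPT output rows spaced RTS apart, so TS must equal WPT * RTS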
#define MAX_NUM_GPU 4
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];
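// Mbegin[i]/Mend[i]: [start, end) range, within this rank's share of the batch, of images assigned to GPU i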
int num_devices = 0;
// Array of device (GPU) pointers
static float *input_d[MAX_NUM_GPU];
static float *input_col_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
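// Grid-stride im2col kernel: thread `index` selects one (channel, output row,
// output column) position and copies the corresponding kernel_h x kernel_w input
// patch (zero at padded border positions) into the column buffer data_col.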
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
float* data_col) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n;
index += blockDim.x * gridDim.x) {
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
float* data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const float* data_im_ptr = data_im;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h_im = h_offset + i * dilation_h;
int w_im = w_offset + j * dilation_w;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
data_col_ptr += height_col * width_col;
}
}
}
}
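// Tiled SGEMM computing C = A * B for row-major A (M x K) and B (K x N).
// Each block produces a TS x TS output tile, staging A and B tiles through shared
// memory; each thread accumulates WPT elements spaced RTS rows apart. Out-of-range
// loads are zero-filled and out-of-range stores skipped, so M, N, K need not be
// multiples of TS.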
__global__ void sgemm(float *A, float *B, float *C, int M, int N, int K, int numTiles) {
const int row = threadIdx.x;
const int col = threadIdx.y;
const int globalRow = TS * blockIdx.x + threadIdx.x;
const int globalCol = TS * blockIdx.y + threadIdx.y;
__shared__ float Asub[TS][TS];
__shared__ float Bsub[TS][TS];
float acc[WPT];
for(int w=0; w<WPT; w++) {
acc[w] = 0.0f;
}
for(int t=0; t<numTiles; t++) {
for(int w=0; w<WPT; w++) {
const int tiledRow = TS*t + row;
const int tiledCol = TS*t + col;
if((globalRow + w*RTS) < M && tiledCol < K)
Asub[w*RTS + row][col] = A[(globalRow + w*RTS)*K + tiledCol];
else
Asub[w*RTS + row][col] = 0.0f;
if((tiledRow + w*RTS) < K && globalCol < N)
Bsub[row + w*RTS][col] = B[(tiledRow + w*RTS)*N + globalCol];
else
Bsub[row + w*RTS][col] = 0.0f;
}
__syncthreads();
for(int k=0; k<TS; k++) {
for(int w=0; w<WPT; w++) {
acc[w] += Asub[row + w*RTS][k] * Bsub[k][col];
}
}
__syncthreads();
}
for(int w=0; w<WPT; w++) {
if((globalRow + w*RTS) < M && globalCol < N)
C[(globalRow + w*RTS)*N + globalCol] = acc[w];
}
}
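// Host-side launcher for im2col_gpu_kernel: computes the unrolled output size and
// starts one thread per (channel, output position) element, 512 threads per block.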
void im2col_gpu(const float* data_im, const int channels,
const int height, const int width, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
float* data_col) {
// Launch one thread per (channel, output row, output column) element,
// channels * height_col * width_col threads in total; each thread copies one
// kernel_h x kernel_w patch into the column buffer.
int height_col = (height + 2 * pad_h -
(dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
int width_col = (width + 2 * pad_w -
(dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
int num_kernels = channels * height_col * width_col;
// NOLINT_NEXT_LINE(whitespace/operators)
const int num_threads = 512;
int num_blocks = (num_kernels + num_threads - 1) / num_threads;
im2col_gpu_kernel<<<num_blocks, num_threads>>>(
num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
width_col, data_col);
}
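// convolution: scatters the batch over MPI ranks, broadcasts the filter, and on each
// GPU computes every assigned output image as filter (K x C*R*S) times the im2col
// matrix (C*R*S x OH*OW), then gathers the results back at rank 0.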
void convolution(
float *_input, float *_output, float *_filter,
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
input = _input;
output = _output;
filter = _filter;
if(mpi_rank != 0) {
alloc_tensor(&input, N, C, H, W);
alloc_tensor(&output, N, K, OH, OW);
alloc_tensor(&filter, K, C, R, S);
}
// Scatter input
if(mpi_rank == 0) {
for(int i=1; i < mpi_world_size; i++) {
MPI_Send(input + ns[i]*im_size, (ne[i]-ns[i])*im_size, MPI_FLOAT, i, 0,
MPI_COMM_WORLD);
}
}
else {
MPI_Recv(input + ns[mpi_rank]*im_size, (ne[mpi_rank]-ns[mpi_rank])*im_size, MPI_FLOAT,
0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
// Broadcast filter
MPI_Bcast(filter, filter_size, MPI_FLOAT, 0, MPI_COMM_WORLD);
// Upload this rank's input slice and the full filter to every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMemcpy(input_d[i], input+(ns[mpi_rank]+Mbegin[i])*im_size,
(Mend[i]-Mbegin[i])*im_size*sizeof(float), cudaMemcpyHostToDevice) );
CUDA_CALL( cudaMemcpy(filter_d[i], filter, filter_size*sizeof(float),
cudaMemcpyHostToDevice) );
}
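// Round the GEMM dimensions up to multiples of TS so the grid covers the whole
// output; numTiles is the number of TS-wide steps along the reduction dimension C*R*S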
int M_XL = (K+TS-1)/TS*TS;
int N_XL = (OH*OW+TS-1)/TS*TS;
int K_XL = (C*R*S+TS-1)/TS*TS;
int numTiles = K_XL/TS;
dim3 blockDim(RTS, TS, 1);
dim3 gridDim(M_XL/TS, N_XL/TS, 1);
// For each image on each GPU: build its im2col matrix, then multiply the filter (K x C*R*S) by it (C*R*S x OH*OW)
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
for(int j = 0; j < Mend[i]-Mbegin[i]; j++) {
im2col_gpu(input_d[i]+j*im_size, C, H, W, R, S, pad, pad, stride, stride, dilation, dilation, input_col_d[i]);
sgemm<<<gridDim, blockDim>>>(filter_d[i], input_col_d[i], output_d[i]+j*res_size, K, OH*OW, C*R*S, numTiles);
}
}
// Download each GPU's output slice back to the host
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );  // the blocking copy then synchronizes with device i's default stream
CUDA_CALL( cudaMemcpy(output + (ns[mpi_rank]+Mbegin[i])*res_size, output_d[i],
(Mend[i]-Mbegin[i])*res_size*sizeof(float),
cudaMemcpyDeviceToHost) );
}
// Gather the per-rank output slices at rank 0
if (mpi_rank == 0) {
for (int i = 1; i < mpi_world_size; i++) {
MPI_Recv(output + ns[i]*res_size, (ne[i]-ns[i])*res_size, MPI_FLOAT, i, 0,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
} else {
MPI_Send(output + ns[mpi_rank]*res_size, (ne[mpi_rank]-ns[mpi_rank])*res_size, MPI_FLOAT, 0, 0,
MPI_COMM_WORLD);
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
}
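// convolution_init: computes the output spatial size, splits the batch across MPI
// ranks (ns/ne) and across this rank's GPUs (Mbegin/Mend), and allocates per-GPU
// device buffers for the input slice, im2col workspace, filter, and output slice.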
void convolution_init(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
N = _N; C = _C; H = _H; W = _W;
K = _K; R = _R; S = _S;
pad = _pad;
dilation = _dilation;
stride = _stride;
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
CUDA_CALL( cudaGetDeviceCount(&num_devices) );
if (num_devices > MAX_NUM_GPU) num_devices = MAX_NUM_GPU;
//printf("Using %d devices\n", num_devices);
for (int i = 0; i < num_devices; i++) {
cudaDeviceProp prop;
CUDA_CALL( cudaGetDeviceProperties(&prop, i) );
// Try printing more detailed information here
//printf("[GPU %d] %s\n", i, prop.name);
}
if (num_devices <= 0) {
printf("No CUDA device found. Aborting\n");
exit(1);
}
// Split the batch [0, N) across MPI ranks (assumes mpi_world_size <= MAX_NODES)
for (int i = 0; i < mpi_world_size; i++) {
ns[i] = N / mpi_world_size * i;
ne[i] = N / mpi_world_size * (i + 1);
}
ne[mpi_world_size - 1] = N;
// Setup problem size for each GPU
int batch_size = ne[mpi_rank]-ns[mpi_rank];
for (int i = 0; i < num_devices; i++) {
Mbegin[i] = (batch_size / num_devices) * i;
Mend[i] = (batch_size / num_devices) * (i + 1);
}
Mend[num_devices - 1] = batch_size;
im_size = C*H*W;
col_size = C*R*S*OH*OW;
res_size = K*OH*OW;
filter_size = K*C*R*S;
// Allocate device memory for each GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMalloc(&input_d[i], (Mend[i]-Mbegin[i])*im_size*sizeof(float)) );
CUDA_CALL( cudaMalloc(&input_col_d[i], col_size*sizeof(float)) );
CUDA_CALL( cudaMalloc(&filter_d[i], filter_size*sizeof(float)) );
CUDA_CALL( cudaMalloc(&output_d[i], (Mend[i]-Mbegin[i])*res_size*sizeof(float)) );
}
}
void convolution_final(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
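// No per-run cleanup; device buffers allocated in convolution_init are not freed here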
}