chundoong-lab-ta/SamsungDS22/submissions/final/seong81.kim/B/convolution.cu

#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include "util.h"
#include <immintrin.h>
#include <cstdio>
#include <cuda_runtime.h>
#define TS 32
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int size[2];
#define CUDA_CALL(f) \
{ \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
err, cudaGetErrorString(err)); \
exit(1); \
} \
}
#define MAX_NUM_GPU 4
int num_devices = 0;
static float *input__d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
static int N_str [MAX_NUM_GPU];
static int N_siz [MAX_NUM_GPU];
void cuda_init() ;
void cuda_final() ;
// Convolution Reference
// for (int n = 0; n < N; ++n) {
// for (int k = 0; k < K; ++k) {
// for (int oh = 0; oh < OH; ++oh) {
// for (int ow = 0; ow < OW; ++ow) {
// float o = 0.f;
// for (int c = 0; c < C; ++c) {
// for (int r = 0; r < R; ++r) {
// for (int s = 0; s < S; ++s) {
// int h = oh * stride - pad + r * dilation;
// int w = ow * stride - pad + s * dilation;
// if (h < 0 || h >= H || w < 0 || w >= W) continue;
// float i = input [n * C * H * W + c * H * W + h * W + w];
// float f = filter[k * C * R * S + c * R * S + r * S + s];
// o += i * f;
// } // s
// } // r
// } // c
// output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
// } // ow
// } // oh
// } // k
// } // n
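//
// Thread mapping used by all kernels below: each thread computes one output
// column ow = blockIdx.x * blockDim.x + threadIdx.x and one output row
// oh = blockIdx.y * blockDim.y + threadIdx.y, while blockIdx.z packs the
// (batch, output-channel) pair as z = n * K + k, so n = blockIdx.z / K and
// k = blockIdx.z % K. kernel4 additionally lets one thread produce 4 adjacent
// ow values.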
// CUDA Kernel 1 -- Naive Loop version
__global__ void kernel1(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int pad, int dilation, int stride, int OH, int OW) {
const int ow_thread = threadIdx.x ;
const int oh_thread = threadIdx.y ;
const int ow_block = blockIdx.x ;
const int oh_block = blockIdx.y ;
const int ow = (ow_block * blockDim.x) + ow_thread ;
const int oh = (oh_block * blockDim.y) + oh_thread ;
const int k_ptr = blockIdx.z % K ;
const int n_ptr = blockIdx.z / K ;
if ((ow >= OW) || (oh >= OH)) return ; // Check Output Boundary
float o = 0.f;
for (int c = 0; c < C; ++c) {
for (int r = 0; r < R; ++r) {
for (int s = 0; s < S; ++s) {
int h = oh * stride - pad + r * dilation;
int w = ow * stride - pad + s * dilation;
if (h < 0 || h >= H || w < 0 || w >= W) continue; // check Input boundary
float i = in_d [n_ptr * C * H * W + c * H * W + h * W + w];
float f = filter_d [k_ptr * C * R * S + c * R * S + r * S + s];
o += i * f;
} // s
} // r
} // c
out_d[n_ptr * K * OH * OW + k_ptr * OH * OW + oh * OW + ow] = o;
}
// CUDA Kernel 2 -- Index-calculation strength reduction
// Filter loop unrolled by 8 (scalar tail covers the S % 8 remainder)
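// kernel2 is only launched when pad == 0, dilation == 1, and stride == 1
// (see convolution() below), so the input coordinates collapse to
// h = oh + r and w = ow + s and the per-element boundary test of kernel1 can
// be dropped: oh <= OH - 1 = H - R and r <= R - 1 imply h <= H - 1 (and the
// same argument holds for w). The innermost s loop is unrolled by 8; the
// scalar loop after it handles the S % 8 remainder.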
__global__ void kernel2(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
const int ow_thread = threadIdx.x ;
const int oh_thread = threadIdx.y ;
const int ow_block = blockIdx.x ;
const int oh_block = blockIdx.y ;
const int ow = (ow_block * blockDim.x) + ow_thread ;
const int oh = (oh_block * blockDim.y) + oh_thread ;
const int k_ptr = blockIdx.z % K ;
const int n_ptr = blockIdx.z / K ;
const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ;
const int i_base = n_ptr * C * H * W ;
const int f_base = k_ptr * C * R * S ;
float i[8] ;
float f[8] ;
const int s_start = S / 8 * 8 ;
if ((ow >= OW) || (oh >= OH)) return ;
float o = 0.f;
for (int c = 0; c < C; ++c) {
int i_base_c = i_base + c * H * W ;
int f_base_c = f_base + c * R * S ;
for (int r = 0; r < R ; r++ ){
for (int s = 0; s < s_start; s += 8) {
int h = oh + r;
int w = ow + s;
i[ 0] = in_d [i_base_c + h * W + w + 0] ;
i[ 1] = in_d [i_base_c + h * W + w + 1] ;
i[ 2] = in_d [i_base_c + h * W + w + 2] ;
i[ 3] = in_d [i_base_c + h * W + w + 3] ;
i[ 4] = in_d [i_base_c + h * W + w + 4] ;
i[ 5] = in_d [i_base_c + h * W + w + 5] ;
i[ 6] = in_d [i_base_c + h * W + w + 6] ;
i[ 7] = in_d [i_base_c + h * W + w + 7] ;
f[ 0] = filter_d[f_base_c + r * S + s + 0] ;
f[ 1] = filter_d[f_base_c + r * S + s + 1] ;
f[ 2] = filter_d[f_base_c + r * S + s + 2] ;
f[ 3] = filter_d[f_base_c + r * S + s + 3] ;
f[ 4] = filter_d[f_base_c + r * S + s + 4] ;
f[ 5] = filter_d[f_base_c + r * S + s + 5] ;
f[ 6] = filter_d[f_base_c + r * S + s + 6] ;
f[ 7] = filter_d[f_base_c + r * S + s + 7] ;
o = i[ 0] * f[ 0] +
i[ 1] * f[ 1] +
i[ 2] * f[ 2] +
i[ 3] * f[ 3] +
i[ 4] * f[ 4] +
i[ 5] * f[ 5] +
i[ 6] * f[ 6] +
i[ 7] * f[ 7] +
o ;
} // s
for (int s = s_start; s < S ; ++s) {
int h = oh + r;
int w = ow + s;
float i = in_d [i_base_c + h * W + w];
float f = filter_d[f_base_c + r * S + s];
o += i * f;
} // s
} // r
} // c
out_d[o_idx + oh * OW + ow] = o;
}
// CUDA Kernel 3 -- Filter load using float4 vector
// Called when S is a multiple of 4
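// kernel3 is kept below in commented-out form for reference; the launch code
// in convolution() uses kernel4 instead whenever the float4 conditions hold.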
//-- __global__ void kernel3(float *in_d, float *out_d, float4 *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
//--
//-- const int ow_thread = threadIdx.x ;
//-- const int oh_thread = threadIdx.y ;
//-- const int ow_block = blockIdx.x ;
//-- const int oh_block = blockIdx.y ;
//--
//-- const int ow = (ow_block * blockDim.x) + ow_thread ;
//-- const int oh = (oh_block * blockDim.y) + oh_thread ;
//--
//-- const int k_ptr = blockIdx.z % K ;
//-- const int n_ptr = blockIdx.z / K ;
//--
//-- const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ;
//-- const int i_base = n_ptr * C * H * W ;
//-- const int f_base = k_ptr * C * R * S ;
//--
//-- float4 i ;
//-- float4 f ;
//-- float4 vo ;
//--
//-- if ((ow >= OW) || (oh >= OH)) return ;
//-- float o = 0.f;
//-- for (int c = 0; c < C; ++c) {
//-- int i_base_c = i_base + c * H * W ;
//-- int f_base_c = f_base + c * R * S ;
//--
//-- for (int r = 0; r < R ; r++ ){
//-- for (int s = 0; s < S ; s += 4) {
//--
//-- int h = oh + r;
//-- int w = ow + s;
//--
//-- i.x = in_d [i_base_c + h * W + w + 0] ;
//-- i.y = in_d [i_base_c + h * W + w + 1] ;
//-- i.z = in_d [i_base_c + h * W + w + 2] ;
//-- i.w = in_d [i_base_c + h * W + w + 3] ;
//--
//-- f = filter_d[(f_base_c + r * S + s)/4];
//--
//-- vo = make_float4(i.x * f.x, i.y * f.y, i.z * f.z , i.w * f.w);
//--
//-- o = vo.x + vo.y + vo.z + vo.w + o;
//-- } // s
//-- } // r
//-- } // c
//-- out_d[o_idx + oh * OW + ow] = o;
//-- }
// CUDA Kernel 4 -- Enlarging the work per thread from 1 output to 4 outputs
// Input image load using float4 vector
// Filter load using float4 vector
// Called when both S and W are multiples of 4
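// Scheme: each thread owns the 4 adjacent output columns ow_base .. ow_base+3.
// For one float4 of filter taps (s .. s+3) those 4 outputs need the 7 input
// samples at w = ow_base+s .. ow_base+s+6, so two consecutive float4 loads
// (i0, i1) supply i[0..7] and the four shifted dot products vo[0..3] reuse
// them. The /4 indexing below stays float4-aligned because W, S, and ow_base
// are all multiples of 4. Stores are guarded individually so a partial tile at
// the right edge (ow_base + j >= OW) is not written.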
__global__ void kernel4(float4 *in_d, float *out_d, float4 *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
const int ow_thread = threadIdx.x ;
const int oh_thread = threadIdx.y ;
const int ow_block = blockIdx.x ;
const int oh_block = blockIdx.y ;
const int ow_base = (ow_block * blockDim.x + ow_thread)* 4 ;
const int oh = (oh_block * blockDim.y) + oh_thread ;
const int k_ptr = blockIdx.z % K ;
const int n_ptr = blockIdx.z / K ;
const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ;
const int i_base = n_ptr * C * H * W ;
const int f_base = k_ptr * C * R * S ;
float4 i0 ;
float4 i1 ;
float i[8] ;
float4 f ;
float4 vo [4] ;
if (oh >= OH) return ;
float o[4] = {0.f, 0.f, 0.f, 0.f};
for (int c = 0; c < C; ++c) {
int i_base_c = i_base + c * H * W ;
int f_base_c = f_base + c * R * S ;
for (int r = 0; r < R ; r++ ){
int h = oh + r;
for (int s = 0; s < S ; s += 4) {
f = filter_d[(f_base_c + r * S + s)/4];
// load 8 consecutive input samples covering the 4 adjacent output columns
i0 = in_d [(i_base_c + h * W + ow_base + s ) / 4];
i1 = in_d [(i_base_c + h * W + ow_base + s + 4) / 4];
i[0] = i0.x ;
i[1] = i0.y ;
i[2] = i0.z ;
i[3] = i0.w ;
i[4] = i1.x ;
i[5] = i1.y ;
i[6] = i1.z ;
i[7] = i1.w ;
vo[0] = make_float4(i[0] * f.x, i[1] * f.y, i[2] * f.z , i[3] * f.w);
vo[1] = make_float4(i[1] * f.x, i[2] * f.y, i[3] * f.z , i[4] * f.w);
vo[2] = make_float4(i[2] * f.x, i[3] * f.y, i[4] * f.z , i[5] * f.w);
vo[3] = make_float4(i[3] * f.x, i[4] * f.y, i[5] * f.z , i[6] * f.w);
o [0] = vo[0].x + vo[0].y + vo[0].z + vo[0].w + o[0];
o [1] = vo[1].x + vo[1].y + vo[1].z + vo[1].w + o[1];
o [2] = vo[2].x + vo[2].y + vo[2].z + vo[2].w + o[2];
o [3] = vo[3].x + vo[3].y + vo[3].z + vo[3].w + o[3];
} // s
} // r
} // c
if (ow_base + 0 < OW)
out_d[o_idx + oh * OW + ow_base + 0] = o[0];
if (ow_base + 1 < OW)
out_d[o_idx + oh * OW + ow_base + 1] = o[1];
if (ow_base + 2 < OW)
out_d[o_idx + oh * OW + ow_base + 2] = o[2];
if (ow_base + 3 < OW)
out_d[o_idx + oh * OW + ow_base + 3] = o[3];
}
void convolution(
float *_input, float *_output, float *_filter,
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
// Assign static global variables
input = _input ;
output = _output ;
filter = _filter ;
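// Inter-node split (only when two MPI ranks are present): rank 0 keeps the
// first size[0] batches and ships the remaining size[1] batches plus the full
// filter to rank 1, which works on its own freshly allocated buffers.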
MPI_Request request[2];
MPI_Status status;
if (mpi_rank == 0 && mpi_world_size == 2) {
MPI_Isend(&input[size[0]*C*H*W], size[1]*C*H*W, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[0]);
MPI_Isend( filter , K*C*R*S , MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[1]);
MPI_Waitall(2, request, MPI_STATUSES_IGNORE);
}
else if (mpi_world_size == 2) {
alloc_tensor(&input , size[1], C, H , W );
alloc_tensor(&output, size[1], K, OH, OW);
alloc_tensor(&filter, K , C, R , S );
MPI_Recv(input , size[1]*C*H*W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(filter, K*C*R*S , MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
}
// Cuda Memory Copy Host to Device
cuda_init();
// Grid Dimension
int grid_X_cnt = (OW + TS - 1) / TS ;
int grid_Y_cnt = (OH + TS - 1) / TS ;
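// One thread block covers a TS x TS tile of the output plane; the z dimension
// of the grid enumerates every (batch, output-channel) pair handled by GPU i,
// i.e. N_siz[i] * K blocks per tile position.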
// Launch kernel on every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
if ((pad == 0) && (dilation == 1) && (stride == 1)) { // index math collapses to h = oh + r, w = ow + s
if ((S%4 == 0) && (W%4==0)) { // float4 offsets stay 4-float aligned
dim3 blockDim( TS / 4 , TS , 1 ); // Max Thread = 1024
dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
kernel4<<<gridDim, blockDim>>>((float4*)input__d[i], output_d[i], (float4 *)filter_d[i], N_siz[i], C, H ,W, K, R, S, OH, OW);
//dim3 blockDim( TS , TS , 1 ); // Max Thread = 1024
//dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
//kernel3<<<gridDim, blockDim>>>(input__d[i], output_d[i], (float4 *)filter_d[i], N_siz[i], C, H ,W, K, R, S, OH, OW);
}
else { // Scalar Memory Access
dim3 blockDim( TS , TS , 1 ); // Max Thread = 1024
dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
kernel2<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i], N_siz[i], C, H ,W, K, R, S, OH, OW);
}
} else { // Using multiply operation
dim3 blockDim( TS , TS , 1 ); // Max Thread = 1024
dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
kernel1<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i], N_siz[i], C, H ,W, K, R, S, pad, dilation, stride, OH, OW);
}
//cudaError_t cudaRunStatus = cudaGetLastError();
//if (cudaRunStatus != cudaSuccess) {
// fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__ - 3, cudaRunStatus, cudaGetErrorString(cudaRunStatus));
// exit(1);
//}
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
// Cuda Mem Copy Device to Host
cuda_final();
// MPI Result Gather
if (mpi_rank == 0 && mpi_world_size == 2) {
MPI_Recv(&output[size[0]*K*OH*OW], size[1]*K*OH*OW, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &status);
}
else if(mpi_world_size == 2){
MPI_Isend(output, size[1]*K*OH*OW, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request[0]);
MPI_Wait(&request[0], MPI_STATUS_IGNORE);
}
}
void cuda_init(){
// Upload the input and filter tensors to every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMemcpy(input__d[i], &input[N_str[i] * C * H * W], N_siz[i] * C * H * W * sizeof(float), cudaMemcpyHostToDevice) );
CUDA_CALL( cudaMemcpy(filter_d[i], filter , K * C * R * S * sizeof(float), cudaMemcpyHostToDevice) );
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
}
void cuda_final(){
// Download the output tensor from every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMemcpy(&output[N_str[i] * K * OH * OW], output_d[i], N_siz[i] * K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost) );
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
}
void convolution_init(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
N = _N; C = _C; H = _H; W = _W;
K = _K; R = _R; S = _S;
pad = _pad;
dilation = _dilation;
stride = _stride;
OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
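// Worked example (illustrative numbers only): H = 128, pad = 0, dilation = 1,
// R = 3, stride = 1 gives OH = (128 + 0 - 1*(3-1) - 1) / 1 + 1 = 126.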
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
if (mpi_world_size == 2) size[1] = (int) ( (float)_N * 0.4f);
else size[1] = 0 ;
size[0] = N - size[1];
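// With two ranks, rank 1 receives roughly 40% of the batches and rank 0 keeps
// the rest; the 0.4 factor is a load-balancing ratio that can be tuned.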
// Cuda Device Initialize
CUDA_CALL( cudaGetDeviceCount(&num_devices) );
//num_devices = 1;
for (int i = 0; i < num_devices; i++) {
cudaDeviceProp prop;
CUDA_CALL( cudaGetDeviceProperties(&prop, i) );
// Try printing more detailed information here
printf("[GPU %d] %s\n", i, prop.name);
}
if (num_devices <= 0) {
printf("No CUDA device found. Aborting\n");
exit(1);
}
// Setup problem size for each GPU
int mod = size[mpi_rank] % num_devices ;
for (int i = 0 ; i < num_devices ; i++){
N_siz[i] = size[mpi_rank] / num_devices ;
}
for (int i = 0 ; i < mod ; i++){
N_siz[i] ++ ;
}
N_str [0] = 0 ;
for (int i = 1 ; i < num_devices ; i++){
N_str[i] = N_str[i - 1] + N_siz[i - 1];
}
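// Example split (illustrative only): size[mpi_rank] = 7 batches on 4 GPUs
// gives N_siz = {2, 2, 2, 1} and starting offsets N_str = {0, 2, 4, 6}.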
// Allocate device memory for each GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMalloc(&input__d[i], N_siz[i] * C* H *W * sizeof(float)) );
CUDA_CALL( cudaMalloc(&filter_d[i], K * C* R *S * sizeof(float)) );
CUDA_CALL( cudaMalloc(&output_d[i], N_siz[i] * K* OH*OW * sizeof(float)) );
}
//printf("\n\n Size[0] = %d, Size[1] = %d\n\n", size[0], size[1]);
}
void convolution_final(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
}