#include "convolution.h" #include #include #include "util.h" #include #include #include #define TS 32 static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int size[2]; #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define MAX_NUM_GPU 4 int num_devices = 0; static float *input__d[MAX_NUM_GPU]; static float *filter_d[MAX_NUM_GPU]; static float *output_d[MAX_NUM_GPU]; static int N_str [MAX_NUM_GPU]; static int N_siz [MAX_NUM_GPU]; void cuda_init() ; void cuda_final() ; // Convolution Reference // for (int n = 0; n < N; ++n) { // for (int k = 0; k < K; ++k) { // for (int oh = 0; oh < OH; ++oh) { // for (int ow = 0; ow < OW; ++ow) { // float o = 0.f; // for (int c = 0; c < C; ++c) { // for (int r = 0; r < R; ++r) { // for (int s = 0; s < S; ++s) { // int h = oh * stride - pad + r * dilation; // int w = ow * stride - pad + s * dilation; // if (h < 0 || h >= H || w < 0 || w >= W) continue; // float i = in_d [n_ptr * C * H * W + c * H * W + h * W + w]; // float f = filter_d[k_ptr * C * R * S + c * R * S + r * S + s]; // o += i * f; // } // s // } // r // } // c // out_d[n_ptr * K * OH * OW + k_ptr * OH * OW + oh * OW + ow] = o; // } //ow // } // oh // } // k // } // n // CUDA Kernel 1 -- Naive Loop version __global__ void kernel1(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int pad, int dilation, int stride, int OH, int OW) { const int ow_thread = threadIdx.x ; const int oh_thread = threadIdx.y ; const int ow_block = blockIdx.x ; const int oh_block = blockIdx.y ; const int ow = (ow_block * blockDim.x) + ow_thread ; const int oh = (oh_block * blockDim.y) + oh_thread ; const int k_ptr = blockIdx.z % K ; const int n_ptr = blockIdx.z / K ; if ((ow >= OW) || (oh >= OH)) return ; // Check Output Boundary float o = 0.f; for (int c = 0; c < C; ++c) { for (int r = 0; r < R; ++r) { for (int s = 0; s < S; ++s) { int h = oh * stride - pad + r * dilation; int w = ow * stride - pad + s * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; // check Input boundary float i = in_d [n_ptr * C * H * W + c * H * W + h * W + w]; float f = filter_d [k_ptr * C * R * S + c * R * S + r * S + s]; o += i * f; } // s } // r } // c out_d[n_ptr * K * OH * OW + k_ptr * OH * OW + oh * OW + ow] = o; } // CUDA Kernel 2 -- Indexing Calculation strength reduction // Filter Loop Unrolling __global__ void kernel2(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) { const int ow_thread = threadIdx.x ; const int oh_thread = threadIdx.y ; const int ow_block = blockIdx.x ; const int oh_block = blockIdx.y ; const int ow = (ow_block * blockDim.x) + ow_thread ; const int oh = (oh_block * blockDim.y) + oh_thread ; const int k_ptr = blockIdx.z % K ; const int n_ptr = blockIdx.z / K ; const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ; const int i_base = n_ptr * C * H * W ; const int f_base = k_ptr * C * R * S ; float i[8] ; float f[8] ; const int s_start = S / 8 * 8 ; if ((ow >= OW) || (oh >= OH)) return ; float o = 0.f; for (int c = 0; c < C; ++c) { int i_base_c = i_base + c * H * W ; int f_base_c = f_base + c * R * S ; for (int r = 0; r < R ; r++ ){ for (int s = 0; s < s_start; s += 8) { int h = oh + r; int w = ow + s; i[ 0] = 
        i[1] = in_d[i_base_c + h * W + w + 1];
        i[2] = in_d[i_base_c + h * W + w + 2];
        i[3] = in_d[i_base_c + h * W + w + 3];
        i[4] = in_d[i_base_c + h * W + w + 4];
        i[5] = in_d[i_base_c + h * W + w + 5];
        i[6] = in_d[i_base_c + h * W + w + 6];
        i[7] = in_d[i_base_c + h * W + w + 7];

        f[0] = filter_d[f_base_c + r * S + s + 0];
        f[1] = filter_d[f_base_c + r * S + s + 1];
        f[2] = filter_d[f_base_c + r * S + s + 2];
        f[3] = filter_d[f_base_c + r * S + s + 3];
        f[4] = filter_d[f_base_c + r * S + s + 4];
        f[5] = filter_d[f_base_c + r * S + s + 5];
        f[6] = filter_d[f_base_c + r * S + s + 6];
        f[7] = filter_d[f_base_c + r * S + s + 7];

        o = i[0] * f[0] + i[1] * f[1] + i[2] * f[2] + i[3] * f[3] +
            i[4] * f[4] + i[5] * f[5] + i[6] * f[6] + i[7] * f[7] + o;
      } // s

      // Remainder loop when S is not a multiple of 8
      for (int s = s_start; s < S; ++s) {
        int h = oh + r;
        int w = ow + s;
        float i = in_d    [i_base_c + h * W + w];
        float f = filter_d[f_base_c + r * S + s];
        o += i * f;
      } // s
    } // r
  } // c
  out_d[o_idx + oh * OW + ow] = o;
}

// CUDA Kernel 3 -- Filter load using float4 vector
//                  Called when S is a multiple of 4
//-- __global__ void kernel3(float *in_d, float *out_d, float4 *filter_d,
//--                         int N, int C, int H, int W, int K, int R, int S,
//--                         int OH, int OW) {
//--   const int ow_thread = threadIdx.x;
//--   const int oh_thread = threadIdx.y;
//--   const int ow_block  = blockIdx.x;
//--   const int oh_block  = blockIdx.y;
//--
//--   const int ow = (ow_block * blockDim.x) + ow_thread;
//--   const int oh = (oh_block * blockDim.y) + oh_thread;
//--
//--   const int k_ptr = blockIdx.z % K;
//--   const int n_ptr = blockIdx.z / K;
//--
//--   const int o_idx  = n_ptr * K * OH * OW + k_ptr * OH * OW;
//--   const int i_base = n_ptr * C * H * W;
//--   const int f_base = k_ptr * C * R * S;
//--
//--   float4 i;
//--   float4 f;
//--   float4 vo;
//--
//--   if ((ow >= OW) || (oh >= OH)) return;
//--   float o = 0.f;
//--   for (int c = 0; c < C; ++c) {
//--     int i_base_c = i_base + c * H * W;
//--     int f_base_c = f_base + c * R * S;
//--
//--     for (int r = 0; r < R; r++) {
//--       for (int s = 0; s < S; s += 4) {
//--         int h = oh + r;
//--         int w = ow + s;
//--
//--         i.x = in_d[i_base_c + h * W + w + 0];
//--         i.y = in_d[i_base_c + h * W + w + 1];
//--         i.z = in_d[i_base_c + h * W + w + 2];
//--         i.w = in_d[i_base_c + h * W + w + 3];
//--
//--         f = filter_d[(f_base_c + r * S + s) / 4];
//--
//--         vo = make_float4(i.x * f.x, i.y * f.y, i.z * f.z, i.w * f.w);
//--
//--         o = vo.x + vo.y + vo.z + vo.w + o;
//--       } // s
//--     } // r
//--   } // c
//--   out_d[o_idx + oh * OW + ow] = o;
//-- }

// CUDA Kernel 4 -- Enlarging work per thread from 1 output to 4 outputs
//                  Input image load using float4 vector
//                  Filter load using float4 vector
//                  Called when both S and W are multiples of 4
__global__ void kernel4(float4 *in_d, float *out_d, float4 *filter_d,
                        int N, int C, int H, int W, int K, int R, int S,
                        int OH, int OW) {
  const int ow_thread = threadIdx.x;
  const int oh_thread = threadIdx.y;
  const int ow_block  = blockIdx.x;
  const int oh_block  = blockIdx.y;

  const int ow_base = (ow_block * blockDim.x + ow_thread) * 4;
  const int oh      = (oh_block * blockDim.y) + oh_thread;

  const int k_ptr = blockIdx.z % K;
  const int n_ptr = blockIdx.z / K;

  const int o_idx  = n_ptr * K * OH * OW + k_ptr * OH * OW;
  const int i_base = n_ptr * C * H * W;
  const int f_base = k_ptr * C * R * S;

  float4 i0;
  float4 i1;
  float  i[8];
  float4 f;
  float4 vo[4];

  if (oh >= OH) return;

  float o[4] = {0.f, 0.f, 0.f, 0.f};
  for (int c = 0; c < C; ++c) {
    int i_base_c = i_base + c * H * W;
    int f_base_c = f_base + c * R * S;

    for (int r = 0; r < R; r++) {
      int h = oh + r;
      for (int s = 0; s < S; s += 4) {
        f = filter_d[(f_base_c + r * S + s) / 4];

        // Load input data to compute 4 OWs
        i0 = in_d[(i_base_c + h * W + ow_base + s    ) / 4];
        i1 = in_d[(i_base_c + h * W + ow_base + s + 4) / 4];
        i[0] = i0.x; i[1] = i0.y; i[2] = i0.z; i[3] = i0.w;
        i[4] = i1.x; i[5] = i1.y; i[6] = i1.z; i[7] = i1.w;

        vo[0] = make_float4(i[0] * f.x, i[1] * f.y, i[2] * f.z, i[3] * f.w);
        vo[1] = make_float4(i[1] * f.x, i[2] * f.y, i[3] * f.z, i[4] * f.w);
        vo[2] = make_float4(i[2] * f.x, i[3] * f.y, i[4] * f.z, i[5] * f.w);
        vo[3] = make_float4(i[3] * f.x, i[4] * f.y, i[5] * f.z, i[6] * f.w);

        o[0] = vo[0].x + vo[0].y + vo[0].z + vo[0].w + o[0];
        o[1] = vo[1].x + vo[1].y + vo[1].z + vo[1].w + o[1];
        o[2] = vo[2].x + vo[2].y + vo[2].z + vo[2].w + o[2];
        o[3] = vo[3].x + vo[3].y + vo[3].z + vo[3].w + o[3];
      } // s
    } // r
  } // c

  if (ow_base + 0 < OW) out_d[o_idx + oh * OW + ow_base + 0] = o[0];
  if (ow_base + 1 < OW) out_d[o_idx + oh * OW + ow_base + 1] = o[1];
  if (ow_base + 2 < OW) out_d[o_idx + oh * OW + ow_base + 2] = o[2];
  if (ow_base + 3 < OW) out_d[o_idx + oh * OW + ow_base + 3] = o[3];
}

void convolution(float *_input, float *_output, float *_filter,
                 int _N, int _C, int _H, int _W, int _K, int _R, int _S,
                 int _pad, int _dilation, int _stride) {
  // Assign static global variables
  input  = _input;
  output = _output;
  filter = _filter;

  MPI_Request request;
  MPI_Status status;

  if (mpi_rank == 0 && mpi_world_size == 2) {
    // Rank 0 sends the second part of the batch (and the filter) to rank 1
    MPI_Isend(&input[size[0] * C * H * W], size[1] * C * H * W, MPI_FLOAT, 1, 0,
              MPI_COMM_WORLD, &request);
    MPI_Isend(filter, K * C * R * S, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request);
  } else if (mpi_world_size == 2) {
    // Rank 1 allocates local buffers and receives its share of the work
    alloc_tensor(&input,  size[1], C, H,  W);
    alloc_tensor(&output, size[1], K, OH, OW);
    alloc_tensor(&filter, K,       C, R,  S);
    MPI_Recv(input,  size[1] * C * H * W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(filter, K * C * R * S,       MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
  }

  // CUDA memory copy host to device
  cuda_init();

  // Grid dimension
  int grid_X_cnt = (OW + TS - 1) / TS;
  int grid_Y_cnt = (OH + TS - 1) / TS;

  // Launch kernel on every GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );

    if ((pad == 0) && (dilation == 1) && (stride == 1)) {
      // No multiply operations needed to compute the input index
      if ((S % 4 == 0) && (W % 4 == 0)) {
        // Use CUDA vector type (float4)
        dim3 blockDim(TS / 4, TS, 1);  // Max Thread = 1024
        dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
        kernel4<<<gridDim, blockDim>>>((float4 *)input__d[i], output_d[i],
                                       (float4 *)filter_d[i], N_siz[i], C, H, W,
                                       K, R, S, OH, OW);
        // dim3 blockDim(TS, TS, 1);  // Max Thread = 1024
        // dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
        // kernel3<<<gridDim, blockDim>>>(input__d[i], output_d[i],
        //                                (float4 *)filter_d[i], N_siz[i], C, H, W,
        //                                K, R, S, OH, OW);
      } else {
        // Scalar memory access
        dim3 blockDim(TS, TS, 1);  // Max Thread = 1024
        dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
        kernel2<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i],
                                       N_siz[i], C, H, W, K, R, S, OH, OW);
      }
    } else {
      // General case: use multiply operations for index computation
      dim3 blockDim(TS, TS, 1);  // Max Thread = 1024
      dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
      kernel1<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i],
                                     N_siz[i], C, H, W, K, R, S,
                                     pad, dilation, stride, OH, OW);
    }

    // cudaError_t cudaRunStatus = cudaGetLastError();
    // if (cudaRunStatus != cudaSuccess) {
    //   fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__ - 3,
    //           cudaRunStatus, cudaGetErrorString(cudaRunStatus));
    //   exit(1);
    // }
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // CUDA memory copy device to host
  cuda_final();

  // MPI result gather
  if (mpi_rank == 0 && mpi_world_size == 2) {
    MPI_Recv(&output[size[0] * K * OH * OW], size[1] * K * OH * OW, MPI_FLOAT, 1, 0,
             MPI_COMM_WORLD, &status);
  } else if (mpi_world_size == 2) {
    MPI_Isend(output, size[1] * K * OH * OW, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request);
  }
}

void cuda_init() {
  // Upload input and filter tensors to every GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(input__d[i], &input[N_str[i] * C * H * W],
                          N_siz[i] * C * H * W * sizeof(float),
                          cudaMemcpyHostToDevice) );
    CUDA_CALL( cudaMemcpy(filter_d[i], filter, K * C * R * S * sizeof(float),
                          cudaMemcpyHostToDevice) );
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

void cuda_final() {
  // Download output tensor from every GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(&output[N_str[i] * K * OH * OW], output_d[i],
                          N_siz[i] * K * OH * OW * sizeof(float),
                          cudaMemcpyDeviceToHost) );
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

void convolution_init(int _N, int _C, int _H, int _W, int _K, int _R, int _S,
                      int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // Split the batch between the two ranks: rank 1 takes 40%, rank 0 the rest
  if (mpi_world_size == 2)
    size[1] = (int)((float)_N * 0.4f);
  else
    size[1] = 0;
  size[0] = N - size[1];

  // CUDA device initialization
  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  // num_devices = 1;

  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, i) );
    // Try printing more detailed information here
    printf("[GPU %d] %s\n", i, prop.name);
  }

  if (num_devices <= 0) {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }

  // Setup problem size for each GPU: distribute this rank's batches evenly,
  // handing the remainder to the first GPUs
  int mod = size[mpi_rank] % num_devices;
  for (int i = 0; i < num_devices; i++) {
    N_siz[i] = size[mpi_rank] / num_devices;
  }
  for (int i = 0; i < mod; i++) {
    N_siz[i]++;
  }
  N_str[0] = 0;
  for (int i = 1; i < num_devices; i++) {
    N_str[i] = N_str[i - 1] + N_siz[i - 1];
  }

  // Allocate device memory on each GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMalloc(&input__d[i], N_siz[i] * C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&filter_d[i], K * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&output_d[i], N_siz[i] * K * OH * OW * sizeof(float)) );
  }

  // printf("\n\n Size[0] = %d, Size[1] = %d\n\n", size[0], size[1]);
}

void convolution_final(int _N, int _C, int _H, int _W, int _K, int _R, int _S,
                       int _pad, int _dilation, int _stride) {
}
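
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original source): a minimal driver showing
// the call order this file expects, assuming the surrounding harness handles
// MPI startup and provides alloc_tensor() from util.h.  The problem sizes
// below are illustrative assumptions only, so the block is left commented out.
//
// int main(int argc, char **argv) {
//   MPI_Init(&argc, &argv);
//
//   // Example (assumed) problem sizes
//   int N = 8, C = 3, H = 224, W = 224;
//   int K = 64, R = 3, S = 3;
//   int pad = 1, dilation = 1, stride = 1;
//   int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
//   int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
//
//   convolution_init(N, C, H, W, K, R, S, pad, dilation, stride);
//
//   int rank;
//   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//
//   float *in = NULL, *out = NULL, *filt = NULL;
//   if (rank == 0) {
//     // Only rank 0 owns the full tensors; rank 1 allocates its share
//     // inside convolution() and receives it over MPI.
//     alloc_tensor(&in,   N, C, H,  W);
//     alloc_tensor(&out,  N, K, OH, OW);
//     alloc_tensor(&filt, K, C, R,  S);
//     // ... fill `in` and `filt` with input data ...
//   }
//
//   convolution(in, out, filt, N, C, H, W, K, R, S, pad, dilation, stride);
//   convolution_final(N, C, H, W, K, R, S, pad, dilation, stride);
//
//   MPI_Finalize();
//   return 0;
// }
// ---------------------------------------------------------------------------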