#include "convolution.h" #include "util.h" #include #include #include #include static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define DEBUG (false) int num_devices = 0; static float *input_d[4]; static float *output_d[4]; static float *filter_d[4]; static int N_str[8], N_size[8]; __global__ void conv_kernel( float *input, float *filter, float *output, int stride, int pad, int dilation, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) { // Calc. Index int n = (blockDim.x * blockIdx.x + threadIdx.x) / K; int k = (blockDim.x * blockIdx.x + threadIdx.x) % K; int oh = blockDim.y * blockIdx.y + threadIdx.y; int ow = blockDim.z * blockIdx.z + threadIdx.z; if (n >= N || k >= K || oh >= OH || ow >= OW) // unaligned return; // Global to Shared extern __shared__ float filter_shared[]; for (int c = 0; c < C; c++) { for (int r = 0; r < R; r++) { for (int s = 0; s < S; s++) { filter_shared[(c * R * S) + (r * S) + s] = filter[(k * C * R * S) + (c * R * S) + (r * S) + s]; } } } __syncthreads(); // For loop (C-R-S) float o = 0.f; for (int c = 0; c < C; c++) { for (int r = 0; r < R; r++) { for (int s = 0; s < S; s++) { int h = oh * stride - pad + r * dilation; int w = ow * stride - pad + s * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; float i = input [(n * C * H * W) + (c * H * W) + (h * W) + w]; //float f = filter[(k * C * R * S) + (c * R * S) + (r * S) + s]; float f = filter_shared[(c * R * S) + (r * S) + s]; o += i * f; } } } output[(n * K * OH * OW) + (k * OH * OW) + (oh * OW) + ow] = o; } void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { // Input Data Assign input = _input; output = _output; filter = _filter; // Split Node Data with MPI if (mpi_world_size > 1 && N > 4) { if (mpi_rank == 0) { if (DEBUG) printf("[rank %d] Data Send...\n", mpi_rank); // send 2nd half MPI_Request request0; MPI_Isend(input+(N_str[4])*C*H*W, (N_size[4]+N_size[5]+N_size[6]+N_size[7])*C*H*W, MPI_FLOAT, 1, 100, MPI_COMM_WORLD, &request0); MPI_Isend(filter, K*C*R*S, MPI_FLOAT, 1, 200, MPI_COMM_WORLD, &request0); } else { if (DEBUG) printf("[rank %d] Data Receive...\n", mpi_rank); // alloc same alloc_tensor(&input, N, C, H, W); // Alloc must here (not init) alloc_tensor(&output, N, K, OH, OW); alloc_tensor(&filter, K, C, R, S); //zero_tensor(output, (N_size[4]+N_size[5]+N_size[6]+N_size[7]), K, OH, OW); // optional zero // receive 2nd half MPI_Recv(input+(N_str[4])*C*H*W, (N_size[4]+N_size[5]+N_size[6]+N_size[7])*C*H*W, MPI_FLOAT, 0, 100, MPI_COMM_WORLD, nullptr); MPI_Recv(filter, K*C*R*S, MPI_FLOAT, 0, 200, MPI_COMM_WORLD, nullptr); } } // Upload matrix to every GPU for (int g = 0; g < num_devices; g++) { if (N_size[mpi_rank*4+g] > 0) { if (DEBUG) printf("[rank %d] GPU %d Upload... 
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W, int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  // Input data assign
  input = _input;
  output = _output;
  filter = _filter;

  // Split node data with MPI
  if (mpi_world_size > 1 && N > 4) {
    if (mpi_rank == 0) {
      if (DEBUG) printf("[rank %d] Data Send...\n", mpi_rank);
      // Send the second half of the batch (and the filter) to rank 1
      MPI_Request send_req[2];
      MPI_Isend(input + N_str[4] * C * H * W,
                (N_size[4] + N_size[5] + N_size[6] + N_size[7]) * C * H * W,
                MPI_FLOAT, 1, 100, MPI_COMM_WORLD, &send_req[0]);
      MPI_Isend(filter, K * C * R * S, MPI_FLOAT, 1, 200, MPI_COMM_WORLD,
                &send_req[1]);
      // Every Isend must eventually be completed; otherwise the requests leak
      MPI_Waitall(2, send_req, MPI_STATUSES_IGNORE);
    } else {
      if (DEBUG) printf("[rank %d] Data Receive...\n", mpi_rank);
      // Allocation must happen here (not in init): rank 1 has no buffers yet
      alloc_tensor(&input, N, C, H, W);
      alloc_tensor(&output, N, K, OH, OW);
      alloc_tensor(&filter, K, C, R, S);
      //zero_tensor(output, (N_size[4]+N_size[5]+N_size[6]+N_size[7]), K, OH, OW); // optional zero
      // Receive the second half of the batch
      MPI_Recv(input + N_str[4] * C * H * W,
               (N_size[4] + N_size[5] + N_size[6] + N_size[7]) * C * H * W,
               MPI_FLOAT, 0, 100, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      MPI_Recv(filter, K * C * R * S, MPI_FLOAT, 0, 200, MPI_COMM_WORLD,
               MPI_STATUS_IGNORE);
    }
  }

  // Upload this rank's input slices and the filter to every GPU
  for (int g = 0; g < num_devices; g++) {
    if (N_size[mpi_rank * 4 + g] > 0) {
      if (DEBUG)
        printf("[rank %d] GPU %d Upload... N_str:%d/N_size:%d\n", mpi_rank, g,
               N_str[mpi_rank * 4 + g], N_size[mpi_rank * 4 + g]);
      CUDA_CALL( cudaMemcpy(input_d[g], input + N_str[mpi_rank * 4 + g] * C * H * W,
                            N_size[mpi_rank * 4 + g] * C * H * W * sizeof(float),
                            cudaMemcpyHostToDevice) );
      CUDA_CALL( cudaMemcpy(filter_d[g], filter, K * C * R * S * sizeof(float),
                            cudaMemcpyHostToDevice) );
    }
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int g = 0; g < num_devices; g++) {
    CUDA_CALL( cudaSetDevice(g) ); // synchronize every device, not just the current one
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Launch the kernel on every GPU
  for (int g = 0; g < num_devices; g++) {
    if (N_size[mpi_rank * 4 + g] > 0) {
      if (DEBUG) printf("[rank %d] GPU %d Process...\n", mpi_rank, g);

      // Max block size is 1024 threads (x*y*z)
      int NK_block = 1;
      int OH_block = 2;
      int OW_block = 64;
      int NK_grid = ((N_size[mpi_rank * 4 + g] * K) + (NK_block - 1)) / NK_block;
      int OH_grid = (OH + (OH_block - 1)) / OH_block;
      int OW_grid = (OW + (OW_block - 1)) / OW_block;
      dim3 blockDim(NK_block, OH_block, OW_block);
      dim3 gridDim(NK_grid, OH_grid, OW_grid);

      CUDA_CALL( cudaSetDevice(g) );
      // Dynamic shared memory holds one k-slice of the filter (C*R*S floats)
      conv_kernel<<<gridDim, blockDim, C * R * S * sizeof(float)>>>(
          input_d[g], filter_d[g], output_d[g],
          stride, pad, dilation,
          N_size[mpi_rank * 4 + g], C, H, W, K, R, S, OH, OW);
      CUDA_CALL( cudaGetLastError() );
    }
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int g = 0; g < num_devices; g++) {
    CUDA_CALL( cudaSetDevice(g) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Merge the per-GPU results back into the host output buffer
  for (int g = 0; g < num_devices; g++) {
    if (N_size[mpi_rank * 4 + g] > 0) {
      if (DEBUG)
        printf("[rank %d] GPU %d Merge... N_str:%d/N_size:%d\n", mpi_rank, g,
               N_str[mpi_rank * 4 + g], N_size[mpi_rank * 4 + g]);
      CUDA_CALL( cudaMemcpy(output + N_str[mpi_rank * 4 + g] * K * OH * OW, output_d[g],
                            N_size[mpi_rank * 4 + g] * K * OH * OW * sizeof(float),
                            cudaMemcpyDeviceToHost) );
    }
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int g = 0; g < num_devices; g++) {
    CUDA_CALL( cudaSetDevice(g) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Merge node data with MPI
  if (mpi_world_size > 1 && N > 4) {
    if (mpi_rank == 0) {
      if (DEBUG) printf("[rank %d] Data Receive...\n", mpi_rank);
      MPI_Recv(output + N_str[4] * K * OH * OW,
               (N_size[4] + N_size[5] + N_size[6] + N_size[7]) * K * OH * OW,
               MPI_FLOAT, 1, 300, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else {
      if (DEBUG) printf("[rank %d] Data Send...\n", mpi_rank);
      MPI_Request request1; // Isend needs a request handle
      MPI_Isend(output + N_str[4] * K * OH * OW,
                (N_size[4] + N_size[5] + N_size[6] + N_size[7]) * K * OH * OW,
                MPI_FLOAT, 0, 300, MPI_COMM_WORLD, &request1);
      MPI_Wait(&request1, MPI_STATUS_IGNORE); // complete the send before returning
    }
  }
}
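/*
 * Batch-split sketch (illustration only; the numbers below are hypothetical).
 * convolution_init() divides the N input images across 8 GPUs (2 nodes x 4
 * GPUs) or 4 GPUs (1 node), giving one extra image to the first N % 8
 * (or N % 4) GPUs. For example, with N = 10 and two nodes, N % 8 = 2, so
 *   N_size = {2, 2, 1, 1, 1, 1, 1, 1}
 *   N_str  = {0, 2, 4, 5, 6, 7, 8, 9}   (prefix sums of N_size)
 * and GPU g on rank r works on images [N_str[r*4+g], N_str[r*4+g] + N_size[r*4+g]).
 * The output size follows the usual convolution formula below; e.g. H = 224,
 * pad = 1, dilation = 1, R = 3, stride = 1 gives
 *   OH = (224 + 2*1 - 1*(3-1) - 1) / 1 + 1 = 224.
 */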
void convolution_init(
    int _N, int _C, int _H, int _W, int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad; dilation = _dilation; stride = _stride;

  // Calc. output size
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
  //if (DEBUG) printf("[convolution %d] OH:%d, OW:%d...\n", mpi_rank, OH, OW);

  // MPI rank / world size
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // Calc. the per-GPU batch split
  if (mpi_world_size > 1) {
    // 2 nodes (8 GPUs)
    N_size[0] = (N / 8) + (N % 8 > 0);
    N_size[1] = (N / 8) + (N % 8 > 1);
    N_size[2] = (N / 8) + (N % 8 > 2);
    N_size[3] = (N / 8) + (N % 8 > 3);
    N_size[4] = (N / 8) + (N % 8 > 4);
    N_size[5] = (N / 8) + (N % 8 > 5);
    N_size[6] = (N / 8) + (N % 8 > 6);
    N_size[7] = (N / 8);
    N_str[0] = 0;
    N_str[1] = N_str[0] + N_size[0];
    N_str[2] = N_str[1] + N_size[1];
    N_str[3] = N_str[2] + N_size[2];
    N_str[4] = N_str[3] + N_size[3];
    N_str[5] = N_str[4] + N_size[4];
    N_str[6] = N_str[5] + N_size[5];
    N_str[7] = N_str[6] + N_size[6];
  } else {
    // 1 node (4 GPUs)
    N_size[0] = (N / 4) + (N % 4 > 0);
    N_size[1] = (N / 4) + (N % 4 > 1);
    N_size[2] = (N / 4) + (N % 4 > 2);
    N_size[3] = (N / 4);
    N_str[0] = 0;
    N_str[1] = N_str[0] + N_size[0];
    N_str[2] = N_str[1] + N_size[1];
    N_str[3] = N_str[2] + N_size[2];
  }

  // GPU memory allocation
  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  printf("[rank %d] Using %d devices\n", mpi_rank, num_devices);
  for (int g = 0; g < num_devices; g++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, g) );
    printf("[rank %d] GPU %d : %s\n", mpi_rank, g, prop.name);
  }
  if (num_devices <= 0) {
    printf("[rank %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  // Setup buffers for each GPU, sized for the largest chunk (N_size[0])
  for (int g = 0; g < num_devices; g++) {
    if (N_size[mpi_rank * 4 + g] > 0) {
      CUDA_CALL( cudaSetDevice(g) );
      CUDA_CALL( cudaMalloc(&input_d[g], N_size[0] * C * H * W * sizeof(float)) );
      CUDA_CALL( cudaMalloc(&filter_d[g], K * C * R * S * sizeof(float)) );
      CUDA_CALL( cudaMalloc(&output_d[g], N_size[0] * K * OH * OW * sizeof(float)) );
    }
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int g = 0; g < num_devices; g++) {
    CUDA_CALL( cudaSetDevice(g) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W, int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int g = 0; g < num_devices; g++) {
    CUDA_CALL( cudaSetDevice(g) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Release the device buffers
  for (int g = 0; g < num_devices; g++) {
    CUDA_CALL( cudaFree(input_d[g]) );
    CUDA_CALL( cudaFree(filter_d[g]) );
    CUDA_CALL( cudaFree(output_d[g]) );
  }
}
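/*
 * Usage sketch (hypothetical driver, compiled out by default): the functions
 * above are meant to be called by an external harness, so the problem size,
 * the fill values, and the use of alloc_tensor() from util.h below are all
 * assumptions made purely to illustrate the expected call order under MPI.
 */
#if 0
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Hypothetical problem size
  int N = 8, C = 3, H = 32, W = 32, K = 16, R = 3, S = 3;
  int pad = 1, dilation = 1, stride = 1;
  int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  float *in = nullptr, *out = nullptr, *flt = nullptr;
  if (rank == 0) {
    // Only rank 0 owns the full tensors; rank 1 allocates its own inside convolution()
    alloc_tensor(&in, N, C, H, W);
    alloc_tensor(&out, N, K, OH, OW);
    alloc_tensor(&flt, K, C, R, S);
    for (int i = 0; i < N * C * H * W; i++) in[i] = 1.0f;
    for (int i = 0; i < K * C * R * S; i++) flt[i] = 0.5f;
  }

  convolution_init(N, C, H, W, K, R, S, pad, dilation, stride);
  convolution(in, out, flt, N, C, H, W, K, R, S, pad, dilation, stride);
  convolution_final(N, C, H, W, K, R, S, pad, dilation, stride);

  MPI_Finalize();
  return 0;
}
#endif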