#include "convolution.h" #include #include #include #include "util.h" // NVIDIA GeForce RTX 3090 // - Computing capability: 8.6 // - Maximum number of threads per block: 1024 // - Maximum amount of shared memory per SM: 100KB // - Maximum amount of shared memory per thread block : 99KB // - Maximum amount of local memory per thread: 512KB // Maximum x-dimension of a grid of thread blocks: 2^31-1 // Maximum y- or z-dimension of a grid of thread blocks: 65535 // - In a SM: // 128 FP32 cores in a SM // 64 INT32 cores for integer math // 4 warp schedulers #define MAX_NUM_GPU 4 #define MAX_THREAD_PER_BLOCK 8 // 16 x 16 = 256 #define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) ) #define CEIL(x,y) ( CEIL_DIV((x),(y)) * (y) ) #define MIN(a,b) ( ((a) < (b)) ? (a) : (b) ) #define CUDA_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__) #define CUDA_KERN_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__) #define CUDA_CALL(d, f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n", \ mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err)); \ exit(1); \ } \ } static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int num_devices; // Array of device (GPU) pointers static float *i_d[MAX_NUM_GPU]; static float *f_d[MAX_NUM_GPU]; static float *o_d[MAX_NUM_GPU]; __global__ void conv_core(float *in, float *flt, float *out, int N, int K, int C, int H, int W, int R, int S, int OH, int OW, int stride, int pad, int dilation) { const int col = threadIdx.x; const int row = threadIdx.y; const int globalCol = blockDim.x * blockIdx.x + col; const int globalRow = blockDim.y * blockIdx.y + row; if (globalCol < OW && globalRow < OH) { for (int n = 0; n < N; ++n) { for (int k = 0; k < K; ++k) { float *outForK = &out[n * K * OH * OW + k * OH * OW]; float o = 0.0f; for (int c = 0; c < C; ++c) { float *inForC = &in[n * C * H * W + c * H * W]; float *fltForC = &flt[k * C * R * S + c * R * S]; for (int r = 0; r < R; r++) { for (int s = 0; s < S; s++) { int h = globalRow * stride - pad + r * dilation; int w = globalCol * stride - pad + s * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; float i = inForC[h * W + w]; float f = fltForC[r * S + s]; o += i * f; // printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n", // globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o); } } } outForK[globalRow * OW + globalCol] = o; CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, o:%p, o:%f\n", n, k, globalRow, globalCol, &outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]); } } } } #define MPI_ASYNC 1 void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { #if MPI_ASYNC == 1 MPI_Request reqs[3] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL }; #endif const int slicedN = _N / mpi_world_size; //////////////////////////////////////////////////////////////////////////////// // scatter if (mpi_rank == 0) { input = _input; output = _output; filter = _filter; } if (slicedN > 0) { #if MPI_ASYNC == 1 MPI_Ibcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]); #else MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD); #endif if (mpi_rank == 0) { const int sizeOfN = slicedN * C * H * W; float *inputForNodes = input + (N * C * H * 
#define MPI_ASYNC 1

void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W, int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
#if MPI_ASYNC == 1
  MPI_Request reqs[3] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL };
#endif
  const int slicedN = _N / mpi_world_size;

  //////////////////////////////////////////////////////////////////////////////
  // scatter
  if (mpi_rank == 0) {
    input = _input;
    output = _output;
    filter = _filter;
  }

  if (slicedN > 0) {
#if MPI_ASYNC == 1
    MPI_Ibcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]);
#else
    MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif
    if (mpi_rank == 0) {
      const int sizeOfN = slicedN * C * H * W;
      float *inputForNodes = input + (N * C * H * W);
      for (int i = 1; i < mpi_world_size; i++) {
#if MPI_ASYNC == 1
        // NOTE: reqs[1] is overwritten on every iteration, so MPI_Waitall below
        // only completes the send to the last rank (safe when mpi_world_size == 2).
        MPI_Isend(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT,
                  i, 1, MPI_COMM_WORLD, &reqs[1]);
#else
        MPI_Send(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT,
                 i, 1, MPI_COMM_WORLD);
#endif
      }
    } else {
#if MPI_ASYNC == 1
      MPI_Irecv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &reqs[1]);
#else
      MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
               MPI_STATUS_IGNORE);
#endif
      // zero_tensor(output, N, K, OH, OW);
    }
#if MPI_ASYNC == 1
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
#endif
  }

  //////////////////////////////////////////////////////////////////////////////
  // calc
  int slicedK = K / num_devices;
  int startK[MAX_NUM_GPU];
  int sizeK[MAX_NUM_GPU];
  for (int d = 0; d < num_devices; d++) {
    startK[d] = slicedK * d;
    sizeK[d] = slicedK;
  }
  sizeK[num_devices - 1] = K - startK[num_devices - 1];

  for (int d = 0; d < num_devices; d++) {
    if (N <= 0 || sizeK[d] <= 0) {
      continue;
    }
    CUDA_CALL( d, cudaSetDevice(d) );

    // Upload input data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
             mpi_rank, d, i_d[d], input,
             N * C * H * W * sizeof(float), N * C * H * W * sizeof(float));
    CUDA_CALL( d, cudaMemcpyAsync(i_d[d], input, N * C * H * W * sizeof(float),
                                  cudaMemcpyHostToDevice) );

    // Upload filter data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
             mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S],
             sizeK[d] * C * R * S * sizeof(float),
             sizeK[d] * C * R * S * sizeof(float));
    CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
                                  sizeK[d] * C * R * S * sizeof(float),
                                  cudaMemcpyHostToDevice) );

    // Launch kernel on every GPU
    dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
    dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK),
                 CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);
    CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
             ", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
             ", stride=%d, pad=%d, dilation=%d\n",
             mpi_rank, d, i_d[d], f_d[d], o_d[d],
             N, sizeK[d], C, H, W, R, S, OH, OW, stride, pad, dilation);
    conv_core<<<gridDim, blockDim>>>(i_d[d], f_d[d], o_d[d],
                                     N, sizeK[d], C, H, W, R, S, OH, OW,
                                     stride, pad, dilation);
  }

  // Download output data from GPUs
  for (int d = 0; d < num_devices; d++) {
    if (N <= 0 || sizeK[d] <= 0) {
      continue;
    }
    CUDA_CALL( d, cudaSetDevice(d) );
    for (int n = 0; n < N; n++) {
      CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
               mpi_rank, d,
               &output[(n * K * OH * OW) + (startK[d] * OH * OW)],
               o_d[d] + n * sizeK[d] * OH * OW,
               sizeK[d] * OH * OW * sizeof(float),
               sizeK[d] * OH * OW * sizeof(float));
      CUDA_CALL( d, cudaMemcpyAsync(&output[(n * K * OH * OW) + (startK[d] * OH * OW)],
                                    o_d[d] + n * sizeK[d] * OH * OW,
                                    sizeK[d] * OH * OW * sizeof(float),
                                    cudaMemcpyDeviceToHost) );
    }
  }

  for (int d = 0; d < num_devices; d++) {
    // Select each device before synchronizing; cudaDeviceSynchronize only
    // waits on the current device.
    CUDA_CALL( d, cudaSetDevice(d) );
    CUDA_CALL( d, cudaDeviceSynchronize() );
  }

  //////////////////////////////////////////////////////////////////////////////
  // gather
  if (slicedN > 0) {
    if (mpi_rank == 0) {
      const int sizeOfN = slicedN * K * OH * OW;
      float *outputForNodes = output + (N * K * OH * OW);
      for (int i = 1; i < mpi_world_size; i++) {
#if MPI_ASYNC == 1
        // NOTE: reqs[2] is overwritten on every iteration, so MPI_Wait below
        // only completes the receive from the last rank (safe when mpi_world_size == 2).
        MPI_Irecv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT,
                  i, 2, MPI_COMM_WORLD, &reqs[2]);
#else
        MPI_Recv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT,
                 i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif
      }
    } else {
#if MPI_ASYNC == 1
      MPI_Isend(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &reqs[2]);
#else
      MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
#endif
    }
#if MPI_ASYNC == 1
    MPI_Wait(&reqs[2], MPI_STATUS_IGNORE);
#endif
  }
}
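// ---------------------------------------------------------------------------
// Work-split summary for convolution() above (illustrative numbers only):
// batches are split across MPI ranks, output channels across GPUs.
// E.g. with _N = 9 and 2 ranks, slicedN = 4, so rank 1 computes 4 batches and
// rank 0 keeps the remaining 5 (N per rank is fixed up in convolution_init
// below). With K = 10 and 4 GPUs, slicedK = 2, startK = {0, 2, 4, 6} and
// sizeK = {2, 2, 2, 4}; the last GPU absorbs the remainder.
// ---------------------------------------------------------------------------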
void convolution_init(
    int _N, int _C, int _H, int _W, int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  N = _N / mpi_world_size;
  if (mpi_rank == 0) {
    N = _N - N * (mpi_world_size - 1);
  } else {
    if (N > 0) {
      alloc_tensor(&input, N, C, H, W);
      alloc_tensor(&filter, K, C, R, S);
      alloc_tensor(&output, N, K, OH, OW);
    }
  }

  CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU) {
    num_devices = MAX_NUM_GPU;
  }
  if (num_devices <= 0) {
    printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  if (N > 0) {
    // Allocate device memory for each GPU
    for (int d = 0; d < num_devices; d++) {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) );   // 32 64 256 256
      CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) );   // 64 64 16 16
      CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
    }

    for (int d = 0; d < num_devices; d++) {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W, int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  // CUDA
  if (N > 0) {
    for (int d = 0; d < num_devices; d++) {
      // Select each device before synchronizing; cudaDeviceSynchronize only
      // waits on the current device.
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
    for (int d = 0; d < num_devices; d++) {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaFree(i_d[d]) );
      CUDA_CALL( d, cudaFree(f_d[d]) );
      CUDA_CALL( d, cudaFree(o_d[d]) );
    }
    for (int d = 0; d < num_devices; d++) {
      CUDA_CALL( d, cudaSetDevice(d) );
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }

  // MPI
  MPI_Barrier(MPI_COMM_WORLD);
  if (N > 0) {
    if (mpi_rank != 0) {
      free(input);
      free(filter);
      free(output);
    }
  }
}
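// ---------------------------------------------------------------------------
// Usage sketch (not part of the original file). The real driver is expected
// to live elsewhere (e.g. a separate main provided by the harness); this
// guarded main() only illustrates the intended call order. The
// CONV_EXAMPLE_MAIN macro and the problem sizes below are assumptions.
// ---------------------------------------------------------------------------
#ifdef CONV_EXAMPLE_MAIN
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Arbitrary sizes for a smoke test.
  const int n = 4, c = 3, h = 32, w = 32;
  const int k = 8, r = 3, s = 3;
  const int p = 1, dil = 1, str = 1;
  const int oh = (h + 2 * p - dil * (r - 1) - 1) / str + 1;
  const int ow = (w + 2 * p - dil * (s - 1) - 1) / str + 1;

  // Only rank 0 supplies host buffers; the other ranks allocate their own
  // slices inside convolution_init() and ignore these pointers.
  float *in = NULL, *flt = NULL, *out = NULL;
  if (rank == 0) {
    in  = (float *)calloc((size_t)n * c * h * w, sizeof(float));
    flt = (float *)calloc((size_t)k * c * r * s, sizeof(float));
    out = (float *)calloc((size_t)n * k * oh * ow, sizeof(float));
  }

  convolution_init(n, c, h, w, k, r, s, p, dil, str);
  convolution(in, out, flt, n, c, h, w, k, r, s, p, dil, str);
  convolution_final(n, c, h, w, k, r, s, p, dil, str);

  if (rank == 0) {
    free(in);
    free(flt);
    free(out);
  }
  MPI_Finalize();
  return 0;
}
#endif // CONV_EXAMPLE_MAIN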