#include "convolution.h" #include "util.h" #include #include #include #include #define NAIVE 0 #define OPTIMIZED 1 #define PRINT_DEBUG 0 // #define KERNEL_VERSION NAIVE #define KERNEL_VERSION OPTIMIZED #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) \ { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define MAX_NUM_GPU 4 #define NUM_THREADS_PER_NODE 40 #define CTILE_SIZE 8 #define RTILE_SIZE 16 #define STILE_SIZE 16 #define min(A, B) (((A) > (B)) ? (B) : (A)) __global__ void kernel_convolution_naive( int mpi_rank, float *input, float *output, float *filter, int N, int C, int H, int W, int K, int R, int S, int OH, int OW, int pad, int dilation, int stride) { int oh = blockIdx.x; int ow = blockIdx.y; for (int n = 0; n < N; n++) { for (int k = 0; k < K; k++) { float o = 0.0; for (int c = 0; c < C; c++) { for (int r = 0; r < R; r++) { for (int s = 0; s < S; s++) { int h = oh * stride - pad + r * dilation; int w = ow * stride - pad + s * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; float i = input[n * C * H * W + c * H * W + h * W + w]; float f = filter[k * C * R * S + c * R * S + r * S + s]; o += i * f; } } } output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o; } } } __global__ void kernel_convolution( int mpi_rank, float *input, float *output, float *filter, int N, int C, int H, int W, int K, int R, int S, int OH, int OW, int pad, int dilation, int stride) { int lr = threadIdx.y; int ls = threadIdx.x; int oh = blockIdx.x * RTILE_SIZE + lr; int ow = blockIdx.y * STILE_SIZE + ls; int n = blockIdx.z / K; int k = blockIdx.z % K; __shared__ float filter_shared[CTILE_SIZE][RTILE_SIZE][STILE_SIZE]; float o = 0.0; for (int rtile = 0; rtile < R; rtile += RTILE_SIZE) { for (int stile = 0; stile < S; stile += STILE_SIZE) { for (int ctile = 0; ctile < C; ctile += CTILE_SIZE) { int r = rtile + lr; int s = stile + ls; int climit = min(ctile + CTILE_SIZE, C); for (int c = ctile; c < climit; c++) { int lc = c % CTILE_SIZE; if (r < R && s < S) { filter_shared[lc][lr][ls] = filter[k * C * R * S + c * R * S + r * S + s]; } else { filter_shared[lc][lr][ls] = 0.0; } } __syncthreads(); for (int c = ctile; c < climit; c++) { for (int r = 0; r < RTILE_SIZE; r++) { for (int s = 0; s < STILE_SIZE; s++) { int lc = c % CTILE_SIZE; int h = oh * stride - pad + (rtile + r) * dilation; int w = ow * stride - pad + (stile + s) * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; float i = input[n * C * H * W + c * H * W + h * W + w]; float f = filter_shared[lc][r][s]; o += i * f; } } } __syncthreads(); } } } if (oh >= 0 && oh < OH && ow >= 0 && ow < OW) { output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o; } } static int num_devices = 0; static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int nstart, nend, nlen; static MPI_Request request; static MPI_Status status; static float *input_d[MAX_NUM_GPU]; static float *filter_d[MAX_NUM_GPU]; static float *output_d[MAX_NUM_GPU]; static int Nbegin[MAX_NUM_GPU], Nend[MAX_NUM_GPU], Nlen[MAX_NUM_GPU]; void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { if (mpi_rank == 0) { input = _input; output = _output; filter = _filter; #if (PRINT_DEBUG == 1) print_filter(filter, K, C, R, S); print_input(input, nlen, C, H, W); #endif } // printf("Node #%d (nstart, nend, nlen): (%d, %d, %d)\n", mpi_rank, nstart, nend, nlen); // cuda_device_malloc(); if (mpi_world_size == 1) { cuda_memcpy_host_to_device(); cuda_kernel_call(); cuda_memcpy_device_to_host(); } else { if (mpi_rank == 0) { int dest = 1; nstart = N / mpi_world_size * dest + min(dest, N % mpi_world_size); nend = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size); nlen = nend - nstart; MPI_Isend(&input[nstart * C * H * W], nlen * C * H * W, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request); MPI_Isend(filter, K * C * R * S, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request); nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size); nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size); nlen = nend - nstart; } else { int source = 0; MPI_Recv(input, nlen * C * H * W, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(filter, K * C * R * S, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); zero_tensor(output, nlen, K, OH, OW); } cuda_memcpy_host_to_device(); cuda_kernel_call(); cuda_memcpy_device_to_host(); if (mpi_rank == 0) { int source = 1; nstart = N / mpi_world_size * source + min(source, N % mpi_world_size); nend = N / mpi_world_size * (source + 1) + min(source + 1, N % mpi_world_size); nlen = nend - nstart; MPI_Recv(&output[nstart * K * OH * OW], nlen * K * OH * OW, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); } else { int dest = 0; MPI_Isend(output, nlen * K * OH * OW, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request); } } } void convolution_init( int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { N = _N; C = _C; H = _H; W = _W; K = _K; R = _R; S = _S; pad = _pad; dilation = _dilation; stride = _stride; omp_set_num_threads(NUM_THREADS_PER_NODE); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size); OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1; OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1; if (mpi_world_size == 1) { // Only 1 node is usable // Set start and end index for node #0 nstart = 0; nend = N; nlen = nend - nstart; } else { // 2 nodes are available if (mpi_rank == 0) { int dest = 1; nstart = N / mpi_world_size * dest + min(dest, N % mpi_world_size); nend = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size); // Send start and end index to node #1 MPI_Isend(&nstart, 1, MPI_INT, dest, 1, MPI_COMM_WORLD, &request); MPI_Isend(&nend, 1, MPI_INT, dest, 1, MPI_COMM_WORLD, &request); nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size); nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size); nlen = nend - nstart; } else { int source = 0; // Receive start and end index from node #0 MPI_Recv(&nstart, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); MPI_Recv(&nend, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); nlen = nend - nstart; alloc_tensor(&input, nlen, C, H, W); alloc_tensor(&output, nlen, K, OH, OW); alloc_tensor(&filter, K, C, R, S); } } cuda_device_init(); cuda_device_malloc(); } void convolution_final( int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { } void cuda_device_init(void) { CUDA_CALL(cudaGetDeviceCount(&num_devices)); // num_devices = 1; /// printf("Node #%d Using %d devices\n", mpi_rank, num_devices); // for (int i = 0; i < num_devices; i++) // { // cudaDeviceProp prop; // CUDA_CALL(cudaGetDeviceProperties(&prop, i)); // // Try printing more detailed information here // // printf("Node #%d [GPU %d] %s\n", mpi_rank, i, prop.name); // } if (num_devices <= 0) { printf("No CUDA device found. Aborting\n"); exit(1); } // Setup problem size for each GPU #pragma parallel for for (int i = 0; i < num_devices; i++) { Nbegin[i] = (nlen / num_devices) * i + min(i, nlen % num_devices); Nend[i] = (nlen / num_devices) * (i + 1) + min(i + 1, nlen % num_devices); Nlen[i] = Nend[i] - Nbegin[i]; #if (PRINT_DEBUG == 1) printf("Node #%d Device #%d (Nbegin, Nend, Nlen): (%d, %d, %d)\n", mpi_rank, i, Nbegin[i], Nend[i], Nlen[i]); #endif } } void cuda_device_malloc(void) { #pragma parallel for for (int i = 0; i < num_devices; i++) { CUDA_CALL(cudaSetDevice(i)); CUDA_CALL(cudaMalloc(&input_d[i], Nlen[i] * C * H * W * sizeof(float))); CUDA_CALL(cudaMalloc(&filter_d[i], K * C * R * S * sizeof(float))); CUDA_CALL(cudaMalloc(&output_d[i], Nlen[i] * K * OH * OW * sizeof(float))); } } void cuda_memcpy_host_to_device(void) { #pragma parallel for for (int i = 0; i < num_devices; i++) { CUDA_CALL(cudaSetDevice(i)); CUDA_CALL(cudaMemcpy(input_d[i], input + Nbegin[i] * C * H * W, Nlen[i] * C * H * W * sizeof(float), cudaMemcpyHostToDevice)); CUDA_CALL(cudaMemcpy(filter_d[i], filter, K * C * R * S * sizeof(float), cudaMemcpyHostToDevice)); } } void cuda_memcpy_device_to_host(void) { #pragma parallel for for (int i = 0; i < num_devices; i++) { CUDA_CALL(cudaSetDevice(i)); CUDA_CALL(cudaMemcpy(output + Nbegin[i] * K * OH * OW, output_d[i], Nlen[i] * K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost)); } #pragma parallel for for (int i = 0; i < num_devices; i++) { CUDA_CALL(cudaSetDevice(i)); CUDA_CALL(cudaDeviceSynchronize()); } } void cuda_kernel_call(void) { #pragma parallel for for (int i = 0; i < num_devices; i++) { #if (KERNEL_VERSION == NAIVE) // Naive Version dim3 blockDim(1, 1); dim3 gridDim(OH, OW); #elif (KERNEL_VERSION == OPTIMIZED) // Optimized Version dim3 blockDim(STILE_SIZE, RTILE_SIZE); dim3 gridDim((OH + RTILE_SIZE - 1) / RTILE_SIZE, (OW + STILE_SIZE - 1) / STILE_SIZE, Nlen[i] * K); #endif #if (PRINT_DEBUG == 1) printf("Device #%d (blockDim.x, blockDim.y): (%d, %d)\n", i, blockDim.x, blockDim.y); printf("Device #%d (gridDim.x, gridDim.y, gridDim.z): (%d, %d, %d)\n", i, gridDim.x, gridDim.y, gridDim.z); #endif CUDA_CALL(cudaSetDevice(i)); #if (KERNEL_VERSION == NAIVE) kernel_convolution_naive<<>>(mpi_rank, input_d[i], output_d[i], filter_d[i], Nlen[i], C, H, W, K, R, S, OH, OW, pad, dilation, stride); #elif (KERNEL_VERSION == OPTIMIZED) kernel_convolution<<>>(mpi_rank, input_d[i], output_d[i], filter_d[i], Nlen[i], C, H, W, K, R, S, OH, OW, pad, dilation, stride); #endif } #pragma parallel for for (int i = 0; i < num_devices; i++) { CUDA_CALL(cudaSetDevice(i)); CUDA_CALL(cudaDeviceSynchronize()); } } void print_filter(float *filter, int K, int C, int R, int S) { printf("--- FILTER (K, C, R, S): (%d, %d, %d, %d) --- \n", K, C, R, S); for (int k = 0; k < K; k++) { for (int c = 0; c < C; c++) { printf("(k, c): (%d, %d)\n", k, c); for (int r = 0; r < R; r++) { for (int s = 0; s < S; s++) { printf("%f ", filter[k * C * R * S + c * R * S + r * S + s]); } printf("\n"); } printf("\n\n"); } } } void print_input(float *input, int N, int C, int H, int W) { printf("--- INPUT (N, K, H, W): (%d, %d, %d, %d) --- \n", N, K, H, W); for (int n = 0; n < N; n++) { for (int c = 0; c < C; c++) { printf("(n, c): (%d, %d)\n", n, c); for (int h = 0; h < H; h++) { for (int w = 0; w < W; w++) { printf("%f ", input[n * C * H * W + c * H * W + h * W + w]); } printf("\n"); } printf("\n\n"); } } }