#include "convolution.h" #include #include #include #include "util.h" #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } // definitions #define MAX_NODE_COUNT 2 #define MAX_GPU_COUNT 4 #define SGEMM_BLOCKSIZE 16 #define IM2COL_NTHREADS 1024 // MPI's per-node variables static int num_devices; static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad, dilation, stride; static int mpi_rank, mpi_world_size; static int wbegin[MAX_NODE_COUNT], wend[MAX_NODE_COUNT]; static int wsize[MAX_NODE_COUNT]; static int mbegin[MAX_NODE_COUNT][MAX_GPU_COUNT]; static int mend[MAX_NODE_COUNT][MAX_GPU_COUNT]; static int msize[MAX_NODE_COUNT][MAX_GPU_COUNT]; static int rounded_M, rounded_N; // GPU devices' variables static float *h_input[MAX_GPU_COUNT]; static float *h_output[MAX_GPU_COUNT]; static float *d_input[MAX_GPU_COUNT]; static float *d_filter[MAX_GPU_COUNT]; static float *d_col[MAX_GPU_COUNT]; static float *d_output[MAX_GPU_COUNT]; static cudaStream_t stream[MAX_GPU_COUNT]; // function declaration __global__ void im2col_gpu_kernel(const int n, const float *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad, const int stride, const int dilation, const int height_col, const int width_col, float* data_col); __global__ void sgemm_gpu_kernel(float *A, float *B, float *C, int M, int N, int K); // static function static void convolution_gpu(void) { int n; int im2col_kernels = C * OH * OW; int im2col_blocks = (im2col_kernels + IM2COL_NTHREADS - 1) / IM2COL_NTHREADS; dim3 sgemm_blockDim(SGEMM_BLOCKSIZE, SGEMM_BLOCKSIZE); dim3 sgemm_gridDim(rounded_N/SGEMM_BLOCKSIZE, rounded_M/SGEMM_BLOCKSIZE); for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaMemcpy(d_filter[i], filter, K * C * R * S * sizeof(float), cudaMemcpyHostToDevice) ); } for (n = 0; n < msize[mpi_rank][0]; n++) { for (int i = 0; i < num_devices; i++) { if (n >= msize[mpi_rank][i]) { continue; } CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaMemcpyAsync(d_input[i], h_input[i] + n * C * H * W, C * H * W * sizeof(float), cudaMemcpyHostToDevice, stream[i]) ); im2col_gpu_kernel<<>>( im2col_kernels, d_input[i], H, W, R, S, pad, stride, dilation, OH, OW, d_col[i]); sgemm_gpu_kernel<<>>( d_filter[i], d_col[i], d_output[i], K, OH * OW, R * S * C); CUDA_CALL( cudaMemcpyAsync(h_output[i] + n * K * OH * OW, d_output[i], K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost, stream[i]) ); } } int d = num_devices - 1; for (n = n; n < msize[mpi_rank][d]; n++) { CUDA_CALL( cudaSetDevice(d) ); CUDA_CALL( cudaMemcpyAsync(d_input[d], h_input[d] + n * C * H * W, C * H * W * sizeof(float), cudaMemcpyHostToDevice, stream[d]) ); im2col_gpu_kernel<<>>( im2col_kernels, d_input[d], H, W, R, S, pad, stride, dilation, OH, OW, d_col[d]); sgemm_gpu_kernel<<>>( d_filter[d], d_col[d], d_output[d], K, OH * OW, R * S * C); CUDA_CALL( cudaMemcpyAsync(h_output[d] + n * K * OH * OW, d_output[d], K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost, stream[d]) ); } for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaDeviceSynchronize() ); } } void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { input = _input; output = _output; filter = _filter; MPI_Request req[MAX_NODE_COUNT][MAX_GPU_COUNT]; if (mpi_rank != 0) { alloc_tensor(&filter, K, C, R, S); } // scattering if (mpi_rank == 0) { for (int i = 1; i < mpi_world_size; i++) { for (int j = 0; j < num_devices; j++) { MPI_Isend(input + (wbegin[i] + mbegin[i][j]) * C * H * W, msize[i][j] * C * H * W, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &req[i - 1][j]); } } for (int i = 0; i < num_devices; i++) { memcpy(h_input[i], input + (wbegin[0] + mbegin[0][i]) * C * H * W, msize[0][i] * C * H * W * sizeof(float)); } if (mpi_world_size > 1) { MPI_Waitall((mpi_world_size - 1) * num_devices, &req[0][0], MPI_STATUSES_IGNORE); } } else { for (int i = 0; i < num_devices; i++) { MPI_Irecv(h_input[i], msize[mpi_rank][i] * C * H * W * sizeof(float), MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &req[0][i]); } if (wsize[mpi_rank] > 0) { MPI_Waitall(num_devices, &req[0][0], MPI_STATUSES_IGNORE); } } // broadcasting if (mpi_world_size > 1) { MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD); } // computation if (wsize[mpi_rank] > 0){ convolution_gpu(); } // gathering if (mpi_rank == 0) { for (int i = 1; i < mpi_world_size; i++) { for (int j = 0; j < num_devices; j++) { MPI_Irecv(output + (wbegin[i] + mbegin[i][j]) * K * OH * OW, msize[i][j] * K * OH * OW, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &req[i - 1][j]); } } for (int i = 0; i < num_devices; i++) { memcpy(output + (wbegin[0] + mbegin[0][i]) * K * OH * OW, h_output[i], msize[0][i] * K * OH * OW * sizeof(float)); } MPI_Waitall((mpi_world_size - 1) * num_devices, &req[0][0], MPI_STATUSES_IGNORE); } else { for (int i = 0; i < num_devices; i++) { MPI_Isend(h_output[i], msize[mpi_rank][i] * K * OH * OW, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &req[0][i]); } MPI_Waitall(num_devices, &req[0][0], MPI_STATUSES_IGNORE); } } void convolution_init( int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { N = _N; C = _C; H = _H; W = _W; K = _K; R = _R; S = _S; pad = _pad; dilation = _dilation; stride = _stride; OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1; OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1; rounded_M = (K + SGEMM_BLOCKSIZE - 1) / SGEMM_BLOCKSIZE * SGEMM_BLOCKSIZE; rounded_N = ((OH * OW) + SGEMM_BLOCKSIZE - 1) / SGEMM_BLOCKSIZE * SGEMM_BLOCKSIZE; MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size); if (mpi_rank == 0) { printf("Using %d compute nodes\n", mpi_world_size); } CUDA_CALL( cudaGetDeviceCount(&num_devices) ); printf("Node %d: Using %d devices\n", mpi_rank, num_devices); if (num_devices <= 0) { printf("No CUDA device. Aborting\n"); exit(1); } // work distribution to nodes for (int i = 0; i < mpi_world_size; i++) { wbegin[i] = N / mpi_world_size * i; wend[i] = N / mpi_world_size * (i + 1); } wend[mpi_world_size - 1] = N; for (int i = 0; i < mpi_world_size; i++) { wsize[i] = wend[i] - wbegin[i]; } // work distribution to devices for (int i = 0; i < mpi_world_size; i++) { for (int j = 0; j < num_devices; j++) { mbegin[i][j] = (wsize[i] / num_devices) * j; mend[i][j] = (wsize[i] / num_devices) * (j + 1); } mend[i][num_devices - 1] = wsize[i]; } for (int i = 0; i < mpi_world_size; i++) { for (int j = 0; j < num_devices; j++) { msize[i][j] = mend[i][j] - mbegin[i][j]; } } // memory allocation for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaStreamCreate(&stream[i]) ); CUDA_CALL( cudaMallocHost(&h_input[i], msize[mpi_rank][i] * C * H * W * sizeof(float)) ); CUDA_CALL( cudaMallocHost(&h_output[i], msize[mpi_rank][i] * K * OH * OW * sizeof(float)) ); CUDA_CALL( cudaMalloc(&d_input[i], C * H * W * sizeof(float)) ); CUDA_CALL( cudaMalloc(&d_filter[i], rounded_M * C * R * S * sizeof(float)) ); CUDA_CALL( cudaMalloc(&d_output[i], rounded_M * rounded_N * sizeof(float)) ); CUDA_CALL( cudaMalloc(&d_col[i], R * S * C * OH * OW * sizeof(float)) ); CUDA_CALL( cudaMemset(d_filter[i], 0, rounded_M * C * R * S * sizeof(float)) ); CUDA_CALL( cudaMemset(d_output[i], 0, rounded_M * rounded_N * sizeof(float)) ); CUDA_CALL( cudaMemset(d_col[i], 0, R * S * C * OH * OW * sizeof(float)) ); } for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaDeviceSynchronize() ); } } void convolution_final( int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaFree(d_input[i]) ); CUDA_CALL( cudaFree(d_filter[i]) ); CUDA_CALL( cudaFree(d_output[i]) ); CUDA_CALL( cudaFree(d_col[i]) ); CUDA_CALL( cudaFreeHost(h_input[i]) ); CUDA_CALL( cudaFreeHost(h_output[i]) ); CUDA_CALL( cudaStreamDestroy(stream[i]) ); } } __global__ void im2col_gpu_kernel(const int n, const float *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad, const int stride, const int dilation, const int height_col, const int width_col, float* data_col) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < n; index += blockDim.x * gridDim.x) { const int h_index = index / width_col; const int h_col = h_index % height_col; const int w_col = index % width_col; const int c_im = h_index / height_col; const int c_col = c_im * kernel_h * kernel_w; const int h_offset = h_col * stride - pad; const int w_offset = w_col * stride - pad; float *data_col_ptr = data_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; const float *data_im_ptr = data_im; data_im_ptr += (c_im * height + h_offset) * width + w_offset; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { int h_im = h_offset + i * dilation; int w_im = w_offset + j * dilation; *data_col_ptr = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? data_im_ptr[i * dilation * width + j * dilation] : 0; data_col_ptr += height_col * width_col; } } } } __global__ void sgemm_gpu_kernel(float *A, float *B, float *C, int M, int N, int K) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int aBegin = K * SGEMM_BLOCKSIZE * by; int aEnd = aBegin + K - 1; int aStep = SGEMM_BLOCKSIZE; int bBegin = SGEMM_BLOCKSIZE * bx; int bStep = SGEMM_BLOCKSIZE * N; float Csub = 0; for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { __shared__ float As[SGEMM_BLOCKSIZE][SGEMM_BLOCKSIZE]; __shared__ float Bs[SGEMM_BLOCKSIZE][SGEMM_BLOCKSIZE]; As[ty][tx] = A[a + K * ty + tx]; Bs[ty][tx] = B[b + N * ty + tx]; __syncthreads(); #pragma unroll for (int k = 0; k < SGEMM_BLOCKSIZE; ++k) { Csub += As[ty][k] * Bs[k][tx]; } __syncthreads(); } if (bx * SGEMM_BLOCKSIZE + tx < N) { int c = N * SGEMM_BLOCKSIZE * by + SGEMM_BLOCKSIZE * bx; C[c + N * ty + tx] = Csub; } }