#include "convolution.h" #include #include #include #include "util.h" #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define MAX_NUM_NODE 2 #define MAX_NUM_GPU 4 #define SGEMM_BSIZE 32 #define IM2COL_NTHREADS 1024 /* Variables for MPI */ static int num_devices; static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad, dilation, stride; static int mpi_rank, mpi_world_size; static int wbegin[MAX_NUM_NODE], wend[MAX_NUM_NODE]; static int mbegin[MAX_NUM_GPU], mend[MAX_NUM_GPU]; static int rounded_M, rounded_N; /* Variables for GPU devices' */ static float *h_input[MAX_NUM_GPU]; static float *h_output[MAX_NUM_GPU]; static float *d_input[MAX_NUM_GPU]; static float *d_filter[MAX_NUM_GPU]; static float *d_col[MAX_NUM_GPU]; static float *d_output[MAX_NUM_GPU]; static cudaStream_t stream[MAX_NUM_GPU]; // cuda stream /* GPU kernel */ __global__ void gpu_im2col(const int n, const float *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad, const int stride, const int dilation, const int height_col, const int width_col, float* data_col); __global__ void gpu_sgemm(float *A, float *B, float *C, int M, int N, int K); static void convolution_gpu (void) { int n; int im2col_kernels = C * OH * OW; int im2col_blocks = (im2col_kernels + IM2COL_NTHREADS - 1) / IM2COL_NTHREADS; dim3 sgemm_blockDim(SGEMM_BSIZE, SGEMM_BSIZE); dim3 sgemm_gridDim(rounded_N/SGEMM_BSIZE, rounded_M/SGEMM_BSIZE); for (int i=0; i= (mend[i] - mbegin[i])) { continue; } CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaMemcpyAsync(d_input[i], h_input[i] + n * C * H * W, C * H * W * sizeof(float), cudaMemcpyHostToDevice, stream[i]) ); gpu_im2col<<>>( im2col_kernels, d_input[i], H, W, R, S, pad, stride, dilation, OH, OW, d_col[i]); gpu_sgemm<<>>( d_filter[i], d_col[i], d_output[i], K, OH * OW, R * S * C); CUDA_CALL( cudaMemcpyAsync(h_output[i] + n * K * OH * OW, d_output[i], K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost, stream[i]) ); } } int d = num_devices - 1; for (n=n; n<(mend[d] - mbegin[d]); n++) { CUDA_CALL( cudaSetDevice(d) ); CUDA_CALL( cudaMemcpyAsync(d_input[d], h_input[d] + n * C * H * W, C * H * W * sizeof(float), cudaMemcpyHostToDevice, stream[d]) ); gpu_im2col<<>>( im2col_kernels, d_input[d], H, W, R, S, pad, stride, dilation, OH, OW, d_col[d]); gpu_sgemm<<>>( d_filter[d], d_col[d], d_output[d], K, OH * OW, R * S * C); CUDA_CALL( cudaMemcpyAsync(h_output[d] + n * K * OH * OW, d_output[d], K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost, stream[d]) ); } for (int i=0; i 0) { convolution_gpu(); } /* Gather */ if (mpi_rank == 0) { for (int i=1; i= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
            data_im_ptr[i * dilation * width + j * dilation] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}

/* Tiled SGEMM computing C = A * B, where A is M x K, B is K x N, and C is
 * M x N with row stride N.  Each 32x32 thread block computes one
 * SGEMM_BSIZE x SGEMM_BSIZE tile of C, staging tiles of A and B through
 * shared memory.  Rows and K-tiles are not bounds-checked; the caller is
 * expected to pad the operands up to multiples of SGEMM_BSIZE. */
__global__ void gpu_sgemm(float *A, float *B, float *C, int M, int N, int K) {
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  /* First and last element of A visited by this block's row of tiles. */
  int aBegin = K * SGEMM_BSIZE * by;
  int aEnd = aBegin + K - 1;
  int aStep = SGEMM_BSIZE;

  /* First element of B for this block's column of tiles and its stride. */
  int bBegin = SGEMM_BSIZE * bx;
  int bStep = SGEMM_BSIZE * N;

  float Csub = 0;

  /* Walk the K dimension one tile at a time, accumulating into Csub. */
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    __shared__ float As[SGEMM_BSIZE][SGEMM_BSIZE];
    __shared__ float Bs[SGEMM_BSIZE][SGEMM_BSIZE];

    As[ty][tx] = A[a + K * ty + tx];
    Bs[ty][tx] = B[b + N * ty + tx];
    __syncthreads();

#pragma unroll
    for (int k = 0; k < SGEMM_BSIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }
    __syncthreads();
  }

  /* Write back, skipping columns that fall in the padding beyond N. */
  if (bx * SGEMM_BSIZE + tx < N) {
    int c = N * SGEMM_BSIZE * by + SGEMM_BSIZE * bx;
    C[c + N * ty + tx] = Csub;
  }
}
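/*
 * The setup that the code above relies on (MPI rank/size, the node-level and
 * GPU-level work splits, the padded GEMM sizes rounded_M and rounded_N, the
 * device buffers, the filter copy) does not appear in this listing.  The
 * helper below is only a minimal sketch of one way that initialization could
 * look; its name, the exact buffer sizes, and the even splits are assumptions
 * rather than recovered code.  It assumes the problem description (N, C, H, W,
 * K, R, S, OH, OW, pad, dilation, stride) has already been filled in.  What it
 * is meant to illustrate is why gpu_sgemm can skip row and K-dimension bounds
 * checks: every GEMM operand is allocated (and zero-filled) with its
 * dimensions rounded up to SGEMM_BSIZE, so out-of-range tile elements read
 * zeros and the extra output rows land in padding that is never copied back.
 *
 *   A = filter : K       x (C*R*S)  -> padded to rounded_M x rounded_K
 *   B = col    : (C*R*S) x (OH*OW)  -> padded to rounded_K x rounded_N
 *   C = output : K       x (OH*OW)  -> padded to rounded_M x rounded_N
 */
static void convolution_init_sketch(void) {
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU) { num_devices = MAX_NUM_GPU; }

  /* Split the batch across nodes, then this node's share across its GPUs;
   * the last node / last GPU takes the remainder, matching the leftover loop
   * in convolution_gpu(). */
  for (int i = 0; i < mpi_world_size; i++) {
    wbegin[i] = (N / mpi_world_size) * i;
    wend[i] = (N / mpi_world_size) * (i + 1);
  }
  wend[mpi_world_size - 1] = N;
  int images = wend[mpi_rank] - wbegin[mpi_rank];
  for (int i = 0; i < num_devices; i++) {
    mbegin[i] = (images / num_devices) * i;
    mend[i] = (images / num_devices) * (i + 1);
  }
  mend[num_devices - 1] = images;

  /* Round every GEMM dimension up to the 32x32 tile size. */
  rounded_M = (K + SGEMM_BSIZE - 1) / SGEMM_BSIZE * SGEMM_BSIZE;
  rounded_N = (OH * OW + SGEMM_BSIZE - 1) / SGEMM_BSIZE * SGEMM_BSIZE;
  int rounded_K = (C * R * S + SGEMM_BSIZE - 1) / SGEMM_BSIZE * SGEMM_BSIZE;

  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaStreamCreate(&stream[i]) );
    CUDA_CALL( cudaMalloc(&d_input[i], (size_t)C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_filter[i],
                          (size_t)rounded_M * rounded_K * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_col[i],
                          (size_t)rounded_K * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_output[i],
                          (size_t)rounded_M * rounded_N * sizeof(float)) );

    /* Zero the padded operands so the padding contributes nothing. */
    CUDA_CALL( cudaMemset(d_filter[i], 0,
                          (size_t)rounded_M * rounded_K * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_col[i], 0,
                          (size_t)rounded_K * rounded_N * sizeof(float)) );

    /* The filter is reused for every image, so copy it once per device
     * (assuming it is already available on every rank). */
    CUDA_CALL( cudaMemcpy(d_filter[i], filter,
                          (size_t)K * C * R * S * sizeof(float),
                          cudaMemcpyHostToDevice) );
  }
}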