#include "util.h" #include static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int filter_size, im_size, col_size, res_size; #define MAX_NODES 2 static int ns[MAX_NODES], ne[MAX_NODES]; #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define TS (32) #define WPT (16) #define RTS (2) #define MAX_NUM_GPU 4 static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU]; int num_devices = 0; // Array of device (GPU) pointers static float *input_d[MAX_NUM_GPU]; static float *input_col_d[MAX_NUM_GPU]; static float *filter_d[MAX_NUM_GPU]; static float *output_d[MAX_NUM_GPU]; __global__ void im2col_gpu_kernel(const int n, const float* data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int height_col, const int width_col, float* data_col) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; \ index < (n); \ index += blockDim.x * gridDim.x) { const int h_index = index / width_col; const int h_col = h_index % height_col; const int w_col = index % width_col; const int c_im = h_index / height_col; const int c_col = c_im * kernel_h * kernel_w; const int h_offset = h_col * stride_h - pad_h; const int w_offset = w_col * stride_w - pad_w; float* data_col_ptr = data_col; data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; const float* data_im_ptr = data_im; data_im_ptr += (c_im * height + h_offset) * width + w_offset; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { int h_im = h_offset + i * dilation_h; int w_im = w_offset + j * dilation_w; *data_col_ptr = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 
        data_col_ptr += height_col * width_col;
      }
    }
  }
}

// Register-tiled SGEMM: C = A * B, where A is M x K, B is K x N, C is M x N,
// all row-major. Each block computes a TS x TS tile of C; each thread
// accumulates WPT output rows (spaced RTS apart) of a single column.
// NOTE: this body is a sketch reconstructed from the tiling parameters and
// the launch configuration above; the boundary checks assume M, N, K need
// not be multiples of TS.
__global__ void sgemm(float *A, float *B, float *C, int M, int N, int K,
                      int numTiles) {
  const int row = threadIdx.x;  // 0 .. RTS-1
  const int col = threadIdx.y;  // 0 .. TS-1
  const int globalRow = TS * blockIdx.x + threadIdx.x;
  const int globalCol = TS * blockIdx.y + threadIdx.y;

  __shared__ float Asub[TS][TS];
  __shared__ float Bsub[TS][TS];

  float acc[WPT];
  for (int w = 0; w < WPT; w++) {
    acc[w] = 0.0f;
  }

  for (int t = 0; t < numTiles; t++) {
    // Cooperatively load one TS x TS tile of A and B into shared memory,
    // zero-padding elements that fall outside the real matrices.
    for (int w = 0; w < WPT; w++) {
      const int r = row + w * RTS;  // local row within the tile
      const int aRow = TS * blockIdx.x + r;
      const int aCol = TS * t + col;
      const int bRow = TS * t + r;
      Asub[r][col] = (aRow < M && aCol < K) ? A[aRow * K + aCol] : 0.0f;
      Bsub[r][col] = (bRow < K && globalCol < N) ? B[bRow * N + globalCol] : 0.0f;
    }
    __syncthreads();

    // Accumulate the partial products for this tile.
    for (int k = 0; k < TS; k++) {
      for (int w = 0; w < WPT; w++) {
        acc[w] += Asub[row + w * RTS][k] * Bsub[k][col];
      }
    }
    __syncthreads();
  }

  // Write back the WPT results owned by this thread.
  for (int w = 0; w < WPT; w++) {
    const int cRow = globalRow + w * RTS;
    if (cRow < M && globalCol < N) {
      C[cRow * N + globalCol] = acc[w];
    }
  }
}

// Host-side wrapper: computes the output spatial size and launches
// im2col_gpu_kernel with one thread per (channel, output position) pair.
// The 512-thread block size is a conventional choice, not mandated elsewhere.
void im2col_gpu(const float *data_im, const int channels, const int height,
                const int width, const int kernel_h, const int kernel_w,
                const int pad_h, const int pad_w, const int stride_h,
                const int stride_w, const int dilation_h, const int dilation_w,
                float *data_col) {
  const int height_col =
      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int width_col =
      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int num_kernels = channels * height_col * width_col;
  const int threads = 512;
  im2col_gpu_kernel<<<(num_kernels + threads - 1) / threads, threads>>>(
      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h, pad_w,
      stride_h, stride_w, dilation_h, dilation_w, height_col, width_col,
      data_col);
}

void convolution(float *_input, float *_output, float *_filter, int _N, int _C,
                 int _H, int _W, int _K, int _R, int _S, int _pad,
                 int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;
  if (mpi_rank != 0) {
    alloc_tensor(&input, N, C, H, W);
    alloc_tensor(&output, N, K, OH, OW);
    alloc_tensor(&filter, K, C, R, S);
  }

  // Scatter input: rank 0 sends each rank its contiguous batch slice
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Send(input + ns[i] * im_size, (ne[i] - ns[i]) * im_size, MPI_FLOAT, i,
               0, MPI_COMM_WORLD);
    }
  } else {
    MPI_Recv(input + ns[mpi_rank] * im_size,
             (ne[mpi_rank] - ns[mpi_rank]) * im_size, MPI_FLOAT, 0, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }

  // Broadcast filter
  MPI_Bcast(filter, filter_size, MPI_FLOAT, 0, MPI_COMM_WORLD);

  // Upload this rank's input slice and the filter to every GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(input_d[i],
                         input + (ns[mpi_rank] + Mbegin[i]) * im_size,
                         (Mend[i] - Mbegin[i]) * im_size * sizeof(float),
                         cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(filter_d[i], filter, filter_size * sizeof(float),
                         cudaMemcpyHostToDevice));
  }

  // Round the GEMM dimensions up to multiples of TS for the grid computation
  int M_XL = (K + TS - 1) / TS * TS;
  int N_XL = (OH * OW + TS - 1) / TS * TS;
  int K_XL = (C * R * S + TS - 1) / TS * TS;
  int numTiles = K_XL / TS;

  dim3 blockDim(RTS, TS, 1);
  dim3 gridDim(M_XL / TS, N_XL / TS, 1);

  // Launch kernels on every GPU: for each image, build its im2col matrix,
  // then multiply it by the filter matrix to produce a K x (OH*OW) output
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    for (int j = 0; j < Mend[i] - Mbegin[i]; j++) {
      im2col_gpu(input_d[i] + j * im_size, C, H, W, R, S, pad, pad, stride,
                 stride, dilation, dilation, input_col_d[i]);
      sgemm<<<gridDim, blockDim>>>(filter_d[i], input_col_d[i],
                                   output_d[i] + j * res_size, K, OH * OW,
                                   C * R * S, numTiles);
    }
  }

  // Download the output slices from the GPUs
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(output + (ns[mpi_rank] + Mbegin[i]) * res_size,
                         output_d[i],
                         (Mend[i] - Mbegin[i]) * res_size * sizeof(float),
                         cudaMemcpyDeviceToHost));
  }

  // Gather output on rank 0
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Recv(output + ns[i] * res_size, (ne[i] - ns[i]) * res_size, MPI_FLOAT,
               i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  } else {
    MPI_Send(output + ns[mpi_rank] * res_size,
             (ne[mpi_rank] - ns[mpi_rank]) * res_size, MPI_FLOAT, 0, 0,
             MPI_COMM_WORLD);
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

void convolution_init(int _N, int _C, int _H, int _W, int _K, int _R, int _S,
                      int _pad, int _dilation, int _stride) {
  N = _N;
  C = _C;
  H = _H;
  W = _W;
  K = _K;
  R = _R;
  S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  CUDA_CALL(cudaGetDeviceCount(&num_devices));
  // printf("Using %d devices\n", num_devices);
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL(cudaGetDeviceProperties(&prop, i));
    // Try printing more detailed information here
    // printf("[GPU %d] %s\n", i, prop.name);
  }
%d] %s\n", i, prop.name); } if (num_devices <= 0) { printf("No CUDA device found. Aborting\n"); exit(1); } for(int i=0; i < mpi_world_size; i++) { ns[i] = N/mpi_world_size *i; ne[i] = N/mpi_world_size*(i+1); } ne[mpi_world_size-1] = N; // Setup problem size for each GPU int batch_size = ne[mpi_rank]-ns[mpi_rank]; for (int i = 0; i < num_devices; i++) { Mbegin[i] = (batch_size / num_devices) * i; Mend[i] = (batch_size / num_devices) * (i + 1); } Mend[num_devices - 1] = batch_size; im_size = C*H*W; col_size = C*R*S*OH*OW; res_size = K*OH*OW; filter_size = K*C*R*S; // Allocate device memory for each GPU for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaMalloc(&input_d[i], (Mend[i]-Mbegin[i])*im_size*sizeof(float)) ); CUDA_CALL( cudaMalloc(&input_col_d[i], col_size*sizeof(float)) ); CUDA_CALL( cudaMalloc(&filter_d[i], filter_size*sizeof(float)) ); CUDA_CALL( cudaMalloc(&output_d[i], (Mend[i]-Mbegin[i])*res_size*sizeof(float)) ); } } void convolution_final( int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { }