#include "convolution.h" #include #include "util.h" #include #include #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define MAX_NUM_NODE 2 #define MAX_NUM_GPU 4 int num_devices = 0; #define TS 32 __global__ void cuda_conv( float *input, float *output, float *filter, int N, int C, int H, int W, int K, int R, int S, int OH, int OW, int pad, int dilation, int stride) { int x = threadIdx.x; int y = threadIdx.y; int global_x = blockDim.x * blockIdx.x + x; int global_y = blockDim.y * blockIdx.y + y; if (global_x >= N * OW || global_y >= K * OH) return; // boundary check int n = global_x / OW; int ow = global_x % OW; int k = global_y / OH; int oh = global_y % OH; float o = 0.f; for (int c = 0; c < C; ++c) { for (int r = 0; r < R; ++r) { int h = oh * stride - pad + r * dilation; if (h < 0 || h >= H) continue; for (int s = 0; s < S; ++s) { int w = ow * stride - pad + s * dilation; if (w < 0 || w >= W) continue; float i = input[(n * C * H * W) + (c * H * W) + (h * W) + w]; float f = filter[(k * C * R * S) + (c * R * S) + (r * S) + s]; o += i * f; } } } output[(n * K * OH * OW) + (k * OH * OW) + (oh * OW) + ow] = o; } static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; int get_size_per_rank(int rank) { const int NN = N / mpi_world_size; if (rank == -1) return 0; else if (rank != 0) return NN; else return (N - (mpi_world_size - 1) * NN); } int get_begin_index(int rank) { if (rank == 0) return 0; else if (rank == 1) return get_size_per_rank(0); else return get_begin_index(rank - 1) + get_size_per_rank(rank - 1); } // Array of device (GPU) pointers static float *a_d[MAX_NUM_NODE][MAX_NUM_GPU]; static float *b_d[MAX_NUM_NODE][MAX_NUM_GPU]; static float *c_d[MAX_NUM_NODE][MAX_NUM_GPU]; static int Nbegin[MAX_NUM_NODE][MAX_NUM_GPU], Nend[MAX_NUM_NODE][MAX_NUM_GPU]; static int Nsize[MAX_NUM_NODE][MAX_NUM_GPU]; void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { input = _input; output = _output; filter = _filter; if (mpi_rank != 0) { alloc_tensor(&input, N, C, H, W); alloc_tensor(&output, N, K, OH, OW); alloc_tensor(&filter, K, C, R, S); } MPI_Barrier(MPI_COMM_WORLD); MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD); if (mpi_rank == 0) { for (int i = 1; i < mpi_world_size; ++i) { MPI_Request request; int index = get_begin_index(i) * C * H * W; int size = get_size_per_rank(i) * C * H * W; MPI_Isend(&input[index], size, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &request); } } else { MPI_Request request; int index = get_begin_index(mpi_rank) * C * H * W; int size = get_size_per_rank(mpi_rank) * C * H * W; MPI_Irecv(&input[index], size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request); zero_tensor(output, N, K, OH, OW); MPI_Wait(&request, MPI_STATUS_IGNORE); } // Upload A and B matrix to every GPU for (int i = 0; i < num_devices; i++) { if (Nsize[mpi_rank][i] == 0) continue; CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaMemcpy(a_d[mpi_rank][i], &input[Nbegin[mpi_rank][i] * C * H * W], Nsize[mpi_rank][i] * C * H * W * sizeof(float), cudaMemcpyHostToDevice) ); CUDA_CALL( cudaMemcpy(b_d[mpi_rank][i], filter, K * C * R * S * sizeof(float), cudaMemcpyHostToDevice) ); } // DO NOT REMOVE; NEEDED FOR TIME MEASURE for (int i 
  // Launch kernel on every GPU
  for (int i = 0; i < num_devices; i++) {
    if (Nsize[mpi_rank][i] == 0) continue;

    int gws[3] = {Nsize[mpi_rank][i] * OW, K * OH, 1};
    int lws[3] = {TS, TS, 1};
    for (int j = 0; j < 3; ++j) {
      // Round the global work size up to a multiple of the block size
      gws[j] = (gws[j] + lws[j] - 1) / lws[j] * lws[j];
    }
    dim3 blockDim(lws[0], lws[1], lws[2]);
    dim3 gridDim(gws[0] / lws[0], gws[1] / lws[1], gws[2] / lws[2]);

    CUDA_CALL( cudaSetDevice(i) );
    cuda_conv<<<gridDim, blockDim>>>(
        a_d[mpi_rank][i], c_d[mpi_rank][i], b_d[mpi_rank][i],
        Nsize[mpi_rank][i], C, H, W, K, R, S, OH, OW, pad, dilation, stride);
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    if (Nsize[mpi_rank][i] == 0) continue;
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Download the output tensor from every GPU
  for (int i = 0; i < num_devices; i++) {
    if (Nsize[mpi_rank][i] == 0) continue;
    CUDA_CALL( cudaMemcpy(output + Nbegin[mpi_rank][i] * K * OH * OW,
                          c_d[mpi_rank][i],
                          Nsize[mpi_rank][i] * K * OH * OW * sizeof(float),
                          cudaMemcpyDeviceToHost) );
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    if (Nsize[mpi_rank][i] == 0) continue;
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Gather each rank's slice of the output on rank 0
  if (mpi_rank == 0) {
    MPI_Request request[mpi_world_size];
    for (int i = 1; i < mpi_world_size; ++i) {
      int index = get_begin_index(i) * K * OH * OW;
      int size = get_size_per_rank(i) * K * OH * OW;
      MPI_Irecv(&output[index], size, MPI_FLOAT, i, 0, MPI_COMM_WORLD,
                &request[i]);
    }
    for (int i = 1; i < mpi_world_size; ++i) {
      MPI_Wait(&request[i], MPI_STATUS_IGNORE);
    }
  } else {
    int index = get_begin_index(mpi_rank) * K * OH * OW;
    int size = get_size_per_rank(mpi_rank) * K * OH * OW;
    MPI_Send(&output[index], size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
  }
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU) num_devices = MAX_NUM_GPU;
  printf("Using %d devices\n", num_devices);
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, i) );
    // Try printing more detailed information here
    printf("[GPU %d] %s\n", i, prop.name);
  }

  if (num_devices <= 0) {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }
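  // Standard convolution output size:
  //   OH = (H + 2*pad - dilation*(R-1) - 1) / stride + 1   (and OW likewise).
  // Illustrative example (values assumed, not taken from the harness):
  //   H = 224, pad = 1, dilation = 1, R = 3, stride = 1  ->  OH = 224.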
Aborting\n"); exit(1); } OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1; OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1; // Setup problem size for each GPU int NN = get_size_per_rank(mpi_rank); int NNbegin = get_begin_index(mpi_rank); for (int i = 0; i < num_devices; i++) { Nbegin[mpi_rank][i] = (NN / num_devices) * i + NNbegin; Nend[mpi_rank][i] = (NN / num_devices) * (i + 1) + NNbegin; Nsize[mpi_rank][i] = Nend[mpi_rank][i] - Nbegin[mpi_rank][i]; } Nend[mpi_rank][num_devices - 1] = NN + NNbegin; Nsize[mpi_rank][num_devices - 1] = Nend[mpi_rank][num_devices - 1] - Nbegin[mpi_rank][num_devices - 1]; // Allocate device memory for each GPU for (int i = 0; i < num_devices; i++) { if (Nsize[mpi_rank][i] == 0) continue; CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaMalloc(&a_d[mpi_rank][i], Nsize[mpi_rank][i] * C * H * W * sizeof(float)) ); CUDA_CALL( cudaMalloc(&b_d[mpi_rank][i], K * C * R * S * sizeof(float)) ); CUDA_CALL( cudaMalloc(&c_d[mpi_rank][i], Nsize[mpi_rank][i] * K * OH * OW * sizeof(float)) ); } } void convolution_final( int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { }