#include "convolution.h" #include #include #include #include #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define TS 32 #define MAX_NUM_GPU 4 int num_devices = 0; static float *input, *output, *filter; static int N, C, H, W; static int N_mpi_aware; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static float * input_d[MAX_NUM_GPU]; static float * filter_d[MAX_NUM_GPU]; static float * output_d[MAX_NUM_GPU]; static int Nbegin[MAX_NUM_GPU]; static int Nend[MAX_NUM_GPU]; static int input_size; static int input_middle; static int filter_size; static int output_size; static int output_middle; __global__ void conv( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride, int OH, int OW) { const int globalRow = blockDim.x * blockIdx.x + threadIdx.x; const int globalCol = blockDim.y * blockIdx.y + threadIdx.y; //int OH = (_H + 2 * _pad - _dilation * (_R - 1) - 1) / _stride + 1; //int OW = (_W + 2 * _pad - _dilation * (_S - 1) - 1) / _stride + 1; int n, k, w; w = globalCol; n = w / (_K * OW); w = w - n * (_K * OW); k = w / OW; w = w - k * OW; int col = w; int row = globalRow; if(globalRow >= OH || globalCol >= _N * _K * OW) return; int start_row = row * _stride - _pad; int start_col = col * _stride - _pad; float o = 0.0f; for(int c = 0; c < _C; c++){ for(int i = 0; i <_R; i++){ for(int j = 0; j < _S; j++){ int h = start_row + i * _dilation; int w = start_col + j * _dilation; if(h < 0 || w < 0 || h >= _H || w >= _W) continue; float in = _input[n*_C*_W*_H + c*_W*_H + h*_W + w]; float fil = _filter[k*_C*_R*_S + c*_R*_S + i*_S + j]; o += in * fil; } } } _output[n*_K*OH*OW + k*OH*OW + row*OW + col] = o; } void convolution_gpu( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { output = _output; //printf("\n\n\n\nhead of convolution_gpu %d\n\n\n\n\n", mpi_rank); //int OH = (_H + 2 * _pad - _dilation * (_R - 1) - 1) / _stride + 1; //int OW = (_W + 2 * _pad - _dilation * (_S - 1) - 1) / _stride + 1; for (int i = 0; i < num_devices; i++) { //CUDA_CALL( cudaSetDevice(i) ); CUDA_CALL( cudaMemcpy(input_d[i], _input + Nbegin[i]*_C*_H*_W, (Nend[i] - Nbegin[i])*_C*_H*_W* sizeof(float), cudaMemcpyHostToDevice) ); CUDA_CALL( cudaMemcpy(filter_d[i], _filter, _K*_C*_R*_S*sizeof(float), cudaMemcpyHostToDevice) ); } for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaDeviceSynchronize() ); } dim3 blockDim(TS, TS, 1); for(int i = 0; i < num_devices; i++){ dim3 gridDim((OH + TS + 1)/TS, ((Nend[i] - Nbegin[i])*_K*OW + TS - 1)/TS, 1); CUDA_CALL( cudaSetDevice(i) ); conv<<>>(input_d[i], output_d[i], filter_d[i], Nend[i] - Nbegin[i], _C, _H, _W, _K ,_R, _S, _pad, _dilation, _stride, OH, OW); } } void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { MPI_Request mpi_request; MPI_Status mpi_status; //N_mpi_aware = N; if(mpi_rank == 0){ input = _input; output = _output; filter = _filter; if(mpi_world_size == 2){ MPI_Isend(input + input_middle, input_size - input_middle, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &mpi_request); MPI_Isend(filter, filter_size, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &mpi_request); //N_mpi_aware = 
    }
  } else {
    // Rank 1 owns no host tensors of its own; allocate buffers for its
    // share of the batch and receive them from rank 0.
    input = (float *) aligned_alloc(32, sizeof(float) * (input_size - input_middle));
    filter = (float *) aligned_alloc(32, sizeof(float) * filter_size);
    output = (float *) aligned_alloc(32, sizeof(float) * (output_size - output_middle));
    MPI_Recv(input, input_size - input_middle, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &mpi_status);
    MPI_Recv(filter, filter_size, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &mpi_status);
  }

  convolution_gpu(input, output, filter, N_mpi_aware, C, H, W, K, R, S,
                  pad, dilation, stride);
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // Standard output-size formula for a padded, dilated, strided convolution.
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  input_size = N * C * H * W;
  input_middle = (N / 2) * C * H * W;
  filter_size = K * C * R * S;
  output_size = N * K * OH * OW;
  output_middle = (N / 2) * K * OH * OW;

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU) num_devices = MAX_NUM_GPU;  // per-GPU arrays hold at most MAX_NUM_GPU entries
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, i) );  // fail early if a device is unusable
  }
  if (num_devices <= 0) {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }

  // With two MPI ranks, rank 0 takes the first N/2 batch items and rank 1
  // takes the rest; with a single rank, it takes everything.
  if (mpi_world_size == 2) {
    if (mpi_rank == 0) {
      N_mpi_aware = N / 2;
    } else {
      N_mpi_aware = N - N / 2;
    }
  } else {
    N_mpi_aware = N;
  }

  // Split this rank's share evenly across its GPUs; the last GPU absorbs
  // the remainder. If there are more GPUs than batch items, fall back to a
  // single GPU.
  if (num_devices > N_mpi_aware) {
    num_devices = 1;
    Nbegin[0] = 0;
    Nend[0] = N_mpi_aware;
  } else {
    for (int i = 0; i < num_devices; i++) {
      Nbegin[i] = (N_mpi_aware / num_devices) * i;
      Nend[i] = (N_mpi_aware / num_devices) * (i + 1);
    }
    Nend[num_devices - 1] = N_mpi_aware;
  }

  // Allocate device buffers for each GPU's slice of the problem.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMalloc(&input_d[i], (Nend[i] - Nbegin[i]) * C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&filter_d[i], filter_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&output_d[i], (Nend[i] - Nbegin[i]) * K * OH * OW * sizeof(float)) );
  }
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  MPI_Request mpi_request;
  MPI_Status mpi_status;

  // Copy each GPU's output slice back to the host. cudaMemcpy on the default
  // stream implicitly waits for the kernel launched on the same device.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(output + Nbegin[i] * K * OH * OW, output_d[i],
                          (Nend[i] - Nbegin[i]) * K * OH * OW * sizeof(float),
                          cudaMemcpyDeviceToHost) );
  }
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // Gather rank 1's half of the output back onto rank 0.
  if (mpi_world_size == 2) {
    if (mpi_rank == 0) {
      MPI_Recv(output + output_middle, output_size - output_middle, MPI_FLOAT,
               1, 1, MPI_COMM_WORLD, &mpi_status);
    } else {
      MPI_Isend(output, output_size - output_middle, MPI_FLOAT, 0, 1,
                MPI_COMM_WORLD, &mpi_request);
      MPI_Wait(&mpi_request, MPI_STATUS_IGNORE);  // complete the send before the caller tears MPI down
    }
  }
}
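
/*
 * Usage sketch. This translation unit has no main(); in the assignment
 * skeleton the driver lives elsewhere. The outline below is a minimal,
 * hypothetical driver (problem sizes and the result handling are
 * illustrative assumptions, not part of this file) showing the intended
 * call sequence: every rank calls all three entry points, but only rank 0
 * owns the full tensors; rank 1 passes NULL and receives its share inside
 * convolution().
 *
 *   #include <mpi.h>
 *   #include <stdlib.h>
 *   #include "convolution.h"
 *
 *   int main(int argc, char **argv) {
 *     MPI_Init(&argc, &argv);
 *     int rank;
 *     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 *
 *     // Illustrative problem size.
 *     int N = 16, C = 3, H = 64, W = 64, K = 8, R = 3, S = 3;
 *     int pad = 1, dilation = 1, stride = 1;
 *     int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
 *     int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
 *
 *     float *in = NULL, *out = NULL, *fil = NULL;
 *     if (rank == 0) {
 *       in  = (float *) aligned_alloc(32, sizeof(float) * N * C * H * W);
 *       out = (float *) aligned_alloc(32, sizeof(float) * N * K * OH * OW);
 *       fil = (float *) aligned_alloc(32, sizeof(float) * K * C * R * S);
 *       // ... fill in and fil ...
 *     }
 *
 *     convolution_init(N, C, H, W, K, R, S, pad, dilation, stride);
 *     convolution(in, out, fil, N, C, H, W, K, R, S, pad, dilation, stride);
 *     convolution_final(N, C, H, W, K, R, S, pad, dilation, stride);
 *     // After convolution_final(), rank 0's out holds the full result.
 *
 *     MPI_Finalize();
 *     return 0;
 *   }
 */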