#include "convolution.h" #include "util.h" #include #include #include #include #define CUDA_CALL(f) \ { \ cudaError_t err = (f); \ if (err != cudaSuccess) { \ fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \ err, cudaGetErrorString(err)); \ exit(1); \ } \ } #define MAX_NUM_CPU 2 #define MAX_NUM_GPU 4 #define TS 8 static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int ns[MAX_NUM_CPU], ne[MAX_NUM_CPU]; static float *input_d[MAX_NUM_GPU]; static float *filter_d[MAX_NUM_GPU]; static float *output_d[MAX_NUM_GPU]; static int Nbegin[MAX_NUM_CPU][MAX_NUM_GPU], Nend[MAX_NUM_CPU][MAX_NUM_GPU]; int num_devices = 0; __global__ void convolution_thread( float *input, float *output, float *filter, int N, int C, int H, int W, int K, int R, int S, int pad, int dilation, int stride, int OH, int OW) { //printf("blockIdx.x:%d\n", blockIdx.x); //printf("blockDim.x:%d\n", blockDim.x); //printf("threadIdx.x:%d\n", threadIdx.x); //printf("blockIdx.y:%d\n", blockIdx.y); #if 0 int oh = (blockIdx.x * blockDim.x + threadIdx.x); int ow = (blockIdx.y * blockDim.y + threadIdx.y); //int k = (blockIdx.z * blockDim.z + threadIdx.z) % K; //int n = (blockIdx.z * blockDim.z + threadIdx.z) / K; int k = (blockIdx.z) % K; int n = (blockIdx.z) / K; #else int ow = threadIdx.x; int oh = blockIdx.x; int k = blockIdx.y; int n = blockIdx.z; //int ow = threadIdx.x; //int oh = blockIdx.z; //int k = blockIdx.y; //int n = blockIdx.x; #endif if (oh >= OH || ow >= OW || k >= K || n >= N) return; //__shared__ int *filter_d; //__syncthreads(); // float o = 0.f; for (int c = 0; c < C; ++c) { for (int r = 0; r < R; ++r) { int h = oh * stride - pad + r * dilation; for (int s = 0; s < S; ++s) { int w = ow * stride - pad + s * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; float i = input[n * C * H * W + c * H * W + h * W + w]; float f = filter[k * C * R * S + c * R * S + r * S + s]; //printf("input[n * C * H * W + c * H * W + h * W + w] = %f\n", i); //printf("filter[k * C * R * S + c * R * S + r * S + s] = %f\n", f); o += i * f; } } } output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o; //printf("output[n * K * OH * OW + k * OH * OW + oh * OW + ow]= %f\n", o); } void convolution ( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { input = _input; output = _output; filter = _filter; if (mpi_rank != 0) { alloc_tensor(&input, N, C, H, W); alloc_tensor(&output, N, K, OH, OW); alloc_tensor(&filter, K, C, R, S); } if (mpi_world_size > 1) { if (mpi_rank == 0) { for (int i = 1; i < mpi_world_size; i++) { MPI_Send(input + ns[i] * C * H * W, (ne[i] - ns[i]) * C * H * W, MPI_FLOAT, i, 0, MPI_COMM_WORLD); MPI_Send(filter, K * C * R * S, MPI_FLOAT, i, 0, MPI_COMM_WORLD); } } else { MPI_Recv(input + ns[mpi_rank] * C * H * W, (ne[mpi_rank] - ns[mpi_rank]) * C * H * W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, nullptr); MPI_Recv(filter, K * C * R * S, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, nullptr); } } //printf("ns[%d] : %d\n", mpi_rank, ns[mpi_rank]); //printf("ne[%d] : %d\n", mpi_rank, ne[mpi_rank]); //for(int i = 0; i < num_devices; i++) //{ // printf("Nbegin[%d][%d] : %d\n", mpi_rank,i,Nbegin[mpi_rank][i]); // printf("Nend[%d][%d] : %d\n", mpi_rank,i,Nend[mpi_rank][i]); //} // Cuda start for (int i = 0; i < num_devices; i++) { if(Nend[mpi_rank][i] - Nbegin[mpi_rank][i] != 0) { //printf("Nend[mpi_rank][%d]:%d\n", i, Nend[mpi_rank][i]); //printf("Nbegin[mpi_rank][%d]:%d\n", i, Nbegin[mpi_rank][i]); //printf("%d, %d\n",ns[mpi_rank], Nbegin[mpi_rank][i]); CUDA_CALL( cudaMemcpy(input_d[i], input + (ns[mpi_rank] + Nbegin[mpi_rank][i]) * C * H * W, (Nend[mpi_rank][i] - Nbegin[mpi_rank][i]) * C * H * W * sizeof(float), cudaMemcpyHostToDevice) ); CUDA_CALL( cudaMemcpy(filter_d[i], filter, K * C * R * S * sizeof(float), cudaMemcpyHostToDevice) ); } } for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaDeviceSynchronize() ); } for (int i = 0; i < num_devices; i++) { //dim3 blockDim(OH, OW); //dim3 gridDim(Nend[mpi_rank][i] - Nbegin[mpi_rank][i], K); //printf("test[%d]\n", mpi_rank); if(Nend[mpi_rank][i] - Nbegin[mpi_rank][i] != 0) { #if 0 dim3 blockDim(TS, TS); dim3 gridDim((OH + TS - 1) / TS, (OW + TS - 1)/ TS, ((Nend[mpi_rank][i] - Nbegin[mpi_rank][i]) * K)); //printf("Nend[mpi_rank][%d]:%d\n", i, Nend[mpi_rank][i]); //printf("Nbegin[mpi_rank][%d]:%d\n", i, Nbegin[mpi_rank][i]); #else dim3 blockDim(OW); dim3 gridDim(OH, K, Nend[mpi_rank][i] - Nbegin[mpi_rank][i]); //dim3 gridDim(Nend[mpi_rank][i] - Nbegin[mpi_rank][i], K, OH); #endif CUDA_CALL( cudaSetDevice(i) ); convolution_thread<<>>(input_d[i], output_d[i], filter_d[i], (Nend[mpi_rank][i] - Nbegin[mpi_rank][i]), _C, _H, _W, _K, _R, _S, _pad, _dilation, _stride, OH, OW); } } for (int i = 0; i < num_devices; i++) { CUDA_CALL( cudaDeviceSynchronize() ); } for (int i = 0; i < num_devices; i++) { if(Nend[mpi_rank][i] - Nbegin[mpi_rank][i] != 0) { CUDA_CALL( cudaMemcpy(output + (ns[mpi_rank] + Nbegin[mpi_rank][i]) * K * OH * OW, output_d[i], (Nend[mpi_rank][i] - Nbegin[mpi_rank][i]) * K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost) ); } } // Cuda end if (mpi_world_size > 1) { if (mpi_rank == 0) { for (int i = 1; i