// Convolution implemented as im2col + tiled matrix multiply, with the input
// split across MPI ranks and the loops parallelised with OpenMP.
#include "convolution.h"
#include "util.h"
// NOTE(review): the three directives below carry no header name in this copy
// of the file (angle-bracket names appear to have been lost). The code uses
// MPI_* calls and OpenMP pragmas, so an MPI header is presumably among them —
// confirm and restore before building.
#include
#include
#include

// File-scope buffers. `input`, `output` and `filter` are assigned in
// convolution(); `input_col` is not referenced in the visible code.
static float *input, *input_col, *output, *filter;
// Dimensions used when allocating the input as (N, C, H, W).
static int N, C, H, W;
// Together with C, dimensions used when allocating the filter as (K, C, R, S).
static int K, R, S;
// Output spatial extents; also the loop bounds inside im2col_cpu().
static int OH, OW;
static int pad;
static int dilation;
static int stride;
// NOTE(review): no assignment to the dimension/rank variables above is
// visible in this part of the file; presumably an init routine sets them.
static int mpi_rank, mpi_world_size;

// Capacity of the per-rank range tables below.
// NOTE(review): ns/ne are indexed by rank up to mpi_world_size - 1 and nothing
// visible here checks mpi_world_size against MAX_NODES — confirm.
#define MAX_NODES (2)
// ns[r] / ne[r]: start (inclusive) and end (exclusive) index of rank r's slice
// along the first input dimension, in units of C*H*W floats.
static int ns[MAX_NODES], ne[MAX_NODES];

// Returns the smaller of a and b.
// NOTE(review): `inline` without `static` is fine in C++; if this file is ever
// built as C, an external definition of min may be missing at link time.
inline int min(int a, int b) { return a < b ? a : b; }

// Tile extents for the i (row), j (column) and k (reduction) loops of
// mat_mul_omp().
#define ITILESIZE (4)
#define JTILESIZE (4096)
#define KTILESIZE (4096)

/*
 * Accumulates the product A * B into C:  C[i][j] += sum_k A[i][k] * B[k][j].
 *
 * All matrices are addressed as dense row-major arrays:
 *   A is MM x KK (A[i * KK + k]),
 *   B is KK x NN (B[k * NN + j]),
 *   C is MM x NN (C[i * NN + j]).
 *
 * C is NOT cleared here; the caller must initialise it (e.g. to zero) first.
 *
 * The outermost loop over row tiles is distributed across OpenMP threads.
 * Each iteration only writes rows [ii, ii + ITILESIZE) of C, so different
 * iterations never write the same element.
 */
void mat_mul_omp(const float *A, const float *B, const int MM, const int NN, const int KK, float *C) {
  #pragma omp parallel for num_threads(20) schedule(dynamic)
  for (int ii = 0; ii < MM; ii += ITILESIZE) {
    for (int jj = 0; jj < NN; jj += JTILESIZE) {
      for (int kk = 0; kk < KK; kk += KTILESIZE) {
        // min() clamps each tile to the matrix edge when the extent is not a
        // multiple of the tile size.
        for (int k = kk; k < min(KK, kk + KTILESIZE); k++) {
          for (int i = ii; i < min(MM, ii + ITILESIZE); i++) {
            // A[i][k] is constant across the innermost loop; load it once.
            float ar = A[i * KK + k];
            // Innermost loop walks B and C contiguously along j.
            for (int j = jj; j < min(NN, jj + JTILESIZE); j++) {
              C[i * NN + j] += ar * B[k * NN + j];
            }
          }
        }
      }
    }
  }
}

/*
 * Expands one image into a column matrix so that convolution can be done as a
 * matrix multiply.
 *
 * data_im  is read as a (channels, height, width) row-major array.
 * data_col is written as a (channels * kernel_h * kernel_w, OH, OW) row-major
 *          array; every element of that range is written exactly once.
 *
 * Row c of data_col corresponds to input channel c / (kernel_h * kernel_w)
 * and kernel tap (h_offset, w_offset). Positions that fall outside the image
 * (i.e. in the padding) are written as 0.
 *
 * NOTE(review): the output extents are taken from the file-scope OH / OW, not
 * derived from the parameters; callers must have those set consistently with
 * height/width/kernel/pad/stride/dilation — confirm.
 */
void im2col_cpu(const float* data_im, const int channels,
    const int height, const int width,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    float* data_col) {
  int channels_col = channels * kernel_h * kernel_w;
  // Each c writes its own disjoint slab of data_col, so iterations are
  // independent.
  #pragma omp parallel for num_threads(20) schedule(dynamic)
  for (int c = 0; c < channels_col; ++c) {
    // Decompose the column-matrix row index into (channel, kernel row, kernel col).
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;

    // Input coordinate sampled by this kernel tap at output position (0, 0).
    const int hc0 = h_offset * dilation_h - pad_h;
    const int wc0 = w_offset * dilation_w - pad_w;

    for (int h = 0; h < OH; ++h) {
      int h_pad = h * stride_h + hc0;

      const int row_offset = (c * OH + h) * OW;
      // May be computed from a negative h_pad; it is only used below after the
      // bounds check has passed.
      const int srow_offset = (c_im * height + h_pad) * width;
      for (int w = 0; w < OW; ++w) {
        int w_pad = w * stride_w + wc0;
        // Casting to unsigned turns negative coordinates into huge values, so
        // one comparison per axis tests 0 <= x < limit.
        if ((((unsigned)h_pad) < ((unsigned)height)) && (((unsigned)w_pad) < ((unsigned)width)))
          data_col[row_offset + w] = data_im[srow_offset + w_pad];
        else {
          // Sample lies in the padding region.
          data_col[row_offset + w] = 0.;
        }
      }
    }
  }
}

/*
 * Entry point for the distributed convolution.
 *
 * The visible part stores the caller's buffers in the file-scope pointers,
 * allocates buffers on non-root ranks, sends each non-root rank its slice of
 * the input, and broadcasts the filter from rank 0.
 *
 * NOTE(review): the underscore-prefixed dimension parameters (_N ... _stride)
 * are not read in the visible part of this function; all sizes come from the
 * file-scope variables — confirm these are set before this call.
 */
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;

  // Non-root ranks get their own buffers. alloc_tensor / zero_tensor are
  // defined elsewhere (presumably util.h); alloc_tensor is handed the address
  // of the pointer, so it presumably replaces the value assigned just above —
  // TODO confirm.
  if(mpi_rank != 0) {
    alloc_tensor(&input, N, C, H, W);
    alloc_tensor(&output, N, K, OH, OW);
    alloc_tensor(&filter, K, C, R, S);
    zero_tensor(output, N, K, OH, OW);
  }

  // Scatter input: rank 0 sends rank i the slice [ns[i], ne[i]) of the first
  // input dimension; each other rank receives its slice at the same offset in
  // its own buffer.
  if(mpi_rank == 0) {
    for(int i=1; i < mpi_world_size; i++) {
      MPI_Send(input + ns[i]*C*H*W, (ne[i]-ns[i])*C*H*W, MPI_FLOAT, i, 0, MPI_COMM_WORLD);
    }
  } else {
    // NOTE(review): the status argument is nullptr; MPI_STATUS_IGNORE is the
    // documented constant for ignoring the status — confirm nullptr is
    // accepted by the MPI implementation in use.
    MPI_Recv(input + ns[mpi_rank]*C*H*W, (ne[mpi_rank]-ns[mpi_rank])*C*H*W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, nullptr);
  }

  // Broadcast filter: every rank ends up with rank 0's K*C*R*S filter values.
  MPI_Bcast(filter, K*C*R*S, MPI_FLOAT, 0, MPI_COMM_WORLD);

  // NOTE(review): only the start of the loop header below was visible when
  // these comments were written; nothing is documented about the loop itself.
  for(int n=ns[mpi_rank]; n