/*
 * Convolution routine that references MPI (MPI_Status, MPI_Request) and
 * AVX-512 intrinsics (__m512, _mm512_loadu_ps, _mm512_mul_ps,
 * _mm512_reduce_add_ps).
 *
 * NOTE(review): this text is damaged and cannot compile as it stands.
 *   - The whole file is collapsed onto two physical lines, so the leading
 *     #include directive and the two "//" comment markers each swallow the
 *     rest of their line.
 *   - The five #include directives after "convolution.h" have no header name.
 *   - The loop header that starts with `for(int idx=0; idx` has lost its
 *     condition and everything up to `= W) {`.
 *   - The file stops in the middle of `for(int proc=1; proc`.
 *   It looks as if every span between a '<' and the following '>' was
 *   stripped; restore the file from the original source before building.
 *
 * What the surviving text shows:
 *
 * File-scope state: the input/output/filter pointers, the sizes N, C, H, W,
 * K, R, S, OH, OW, the pad/dilation/stride settings, and mpi_rank and
 * mpi_world_size. ns, ne and ncounts are 4-element int arrays that are not
 * used anywhere in the surviving text. NUM_THREADS, TSR and TSS are defined
 * as 40, 4 and 16.
 *
 * my_min(x, y): returns the smaller of two ints.
 *
 * my_alloc_tensor(t, D0, D1, D2, D3): requests room for D0*D1*D2*D3 floats
 * from aligned_alloc with 32-byte alignment and stores the pointer in *t.
 * If the pointer is NULL it prints a message with printf and calls exit(0).
 *   NOTE(review): exit(0) reports success to the parent process on an
 *   allocation failure; a non-zero status is probably intended.
 *   NOTE(review): C11 requires the size passed to aligned_alloc to be a
 *   multiple of the alignment; nothing here enforces that.
 *
 * convolution(...): stores _input, _output and _filter in the file-scope
 * pointers, then computes OH and OW from the file-scope H, W, pad, dilation,
 * R, S and stride.
 *   NOTE(review): in the surviving text the underscore-prefixed size
 *   parameters (_N, _C, _H, ...) are never copied into those statics;
 *   presumably that happens elsewhere or in the lost text. Confirm.
 *
 * Surviving part of the kernel:
 *   - Under a condition whose visible tail is `= W)`, local_input[0..3][s]
 *     is set to 0. Otherwise each of the rows h0..h3 is set to 0 when the
 *     row index is outside [0, H) and is read from
 *     input[n*C*H*W + c*H*W + h*W + w] when it is inside.
 *   - "case1": loads one 16-float vector from each of local_input[0..3] and
 *     from filter rows r+0..r+3 starting at column 0, multiplies them
 *     pairwise and adds the four horizontal sums to o.
 *     NOTE(review): each filter load reads 16 consecutive floats from the
 *     start of a row; the condition that guards this path is not visible,
 *     so confirm that it guarantees the row is that long.
 *   - Otherwise: a scalar loop over r and s with h = h_start + r*dilation
 *     and w = w_start + s*dilation that skips taps outside the input and
 *     accumulates input*filter into o.
 *   - o is stored at output[n*K*OH*OW + k*OH*OW + oh*OW + ow].
 *   - A "send output" section begins with a loop over proc from 1 on
 *     mpi_rank 0; the rest is missing.
 */
#include "convolution.h" #include #include #include #include #include static float *input, *output, *filter; static int N, C, H, W; static int K, R, S; static int OH, OW; static int pad; static int dilation; static int stride; static int mpi_rank, mpi_world_size; static int my_min(int x, int y) { return x < y ? x : y; } void my_alloc_tensor(float **t, int D0, int D1, int D2, int D3) { *t = (float *) aligned_alloc(32, sizeof(float) * D0 * D1 * D2 * D3); if (*t == NULL) { printf("Failed to allocate memory for matrix.\n"); exit(0); } } int ns[4]; int ne[4]; int ncounts[4]; #define NUM_THREADS 40 #define TSR 4 #define TSS 16 void convolution( float *_input, float *_output, float *_filter, int _N, int _C, int _H, int _W, int _K, int _R, int _S, int _pad, int _dilation, int _stride) { input = _input; output = _output; filter = _filter; MPI_Status status; MPI_Request request; OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1; OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1; for(int idx=0; idx= W) { local_input[0][s] = 0; local_input[1][s] = 0; local_input[2][s] = 0; local_input[3][s] = 0; } else { if (h0 < 0 || h0 >= H) local_input[0][s] = 0; else local_input[0][s] = input[n * C * H * W + c * H * W + h0 * W + w]; if (h1 < 0 || h1 >= H) local_input[1][s] = 0; else local_input[1][s] = input[n * C * H * W + c * H * W + h1 * W + w]; if (h2 < 0 || h2 >= H) local_input[2][s] = 0; else local_input[2][s] = input[n * C * H * W + c * H * W + h2 * W + w]; if (h3 < 0 || h3 >= H) local_input[3][s] = 0; else local_input[3][s] = input[n * C * H * W + c * H * W + h3 * W + w]; } } //case1 __m512 i0 = _mm512_loadu_ps(&local_input[0][0]); __m512 i1 = _mm512_loadu_ps(&local_input[1][0]); __m512 i2 = _mm512_loadu_ps(&local_input[2][0]); __m512 i3 = _mm512_loadu_ps(&local_input[3][0]); __m512 f0 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+0) * S + 0]); __m512 f1 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+1) * S + 0]); __m512 f2 = 
_mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+2) * S + 0]); __m512 f3 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+3) * S + 0]); __m512 to0 = _mm512_mul_ps(i0, f0); __m512 to1 = _mm512_mul_ps(i1, f1); __m512 to2 = _mm512_mul_ps(i2, f2); __m512 to3 = _mm512_mul_ps(i3, f3); o += _mm512_reduce_add_ps(to0); o += _mm512_reduce_add_ps(to1); o += _mm512_reduce_add_ps(to2); o += _mm512_reduce_add_ps(to3); } } else { for (int r = 0; r < R; ++r) { int h = h_start + r * dilation; for (int s = 0; s < S; ++s) { int w = w_start + s * dilation; if (h < 0 || h >= H || w < 0 || w >= W) continue; float i = input[n * C * H * W + c * H * W + h * W + w]; float f = filter[k * C * R * S + c * R * S + r * S + s]; o += i * f; } } } } output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o; } } } } //send output if (mpi_rank == 0) { for(int proc=1; proc