chundoong-lab-ta/SamsungDS22/submissions/final/yh1597.yang/A/convolution.cpp

#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include <CL/cl.h>
#include <cuda_runtime.h>
#include <immintrin.h>


static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;

static int my_min(int x, int y) {
  return x < y ? x : y;
}

void my_alloc_tensor(float **t, int D0, int D1, int D2, int D3) {
  *t = (float *) aligned_alloc(32, sizeof(float) * D0 * D1 * D2 * D3);
  if (*t == NULL) {
    printf("Failed to allocate memory for matrix.\n");
    exit(0);
  }
}

int ns[4];
int ne[4];
int ncounts[4];

#define NUM_THREADS 40
#define TSR 4
#define TSS 16

void convolution(
    float *_input, float *_output, float *_filter, 
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;

  MPI_Status status;
  MPI_Request request;

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  for(int idx=0; idx<mpi_world_size; idx++) {
    ns[idx] = N / mpi_world_size * idx + my_min(idx, N % mpi_world_size);
    if(idx == (mpi_world_size-1))
      ne[idx] = N;
    else
      ne[idx] = N / mpi_world_size * (idx + 1) + my_min(idx + 1, N % mpi_world_size);
    ncounts[idx] = ne[idx]-ns[idx];
  }
  //memory allocation
  if(mpi_rank != 0) {
    my_alloc_tensor(&input, N, C, H, W);
    my_alloc_tensor(&output, N, K, OH, OW);
    my_alloc_tensor(&filter, K, C, R, S);
  }
  //send input
  if (mpi_rank == 0) {
    for(int proc=1; proc<mpi_world_size; proc++)
      if(ncounts[proc] != 0)
        MPI_Isend(input+ns[proc]*C*H*W, ncounts[proc]*C*H*W ,MPI_FLOAT, proc/*dst*/, proc /*tag*/, MPI_COMM_WORLD, &request);
  }
  else {
    if(ncounts[mpi_rank] != 0)
      MPI_Recv(input, ncounts[mpi_rank]*C*H*W, MPI_FLOAT, 0 /*src*/, mpi_rank /*tag*/, MPI_COMM_WORLD, &status);
  }
  //send filter
  MPI_Bcast(filter, K*C*R*S, MPI_FLOAT, 0 /*src*/, MPI_COMM_WORLD);
#pragma omp parallel for num_threads(NUM_THREADS) collapse(3) schedule(dynamic)
    for (int n = 0; n < ncounts[mpi_rank]; ++n) {
      for (int k = 0; k < K; ++k) {
        for (int oh = 0; oh < OH; ++oh) {
          int h_start = oh * stride - pad;
          for (int ow = 0; ow < OW; ++ow) {
            int w_start = ow * stride - pad;
            float o = 0.0f;
            for (int c = 0; c < C; ++c) {
              if((S == 16) && (R % TSR) == 0) {
                for (int r = 0; r < R; r+=TSR) {
                  int h0 = h_start + (r+0) * dilation;
                  int h1 = h_start + (r+1) * dilation;
                  int h2 = h_start + (r+2) * dilation;
                  int h3 = h_start + (r+3) * dilation;
                  float local_input[TSR][TSS];
                    for (int s = 0; s < S; s++) {
                      int w = w_start + s * dilation;
                      if(w < 0 || w >= W) {
                        local_input[0][s] = 0;
                        local_input[1][s] = 0;
                        local_input[2][s] = 0;
                        local_input[3][s] = 0;
                      }
                      else {
                        if (h0 < 0 || h0 >= H)
                          local_input[0][s] = 0;
                        else
                          local_input[0][s] = input[n * C * H * W + c * H * W + h0 * W + w];
                        if (h1 < 0 || h1 >= H)
                          local_input[1][s] = 0;
                        else
                          local_input[1][s] = input[n * C * H * W + c * H * W + h1 * W + w];
                        if (h2 < 0 || h2 >= H)
                          local_input[2][s] = 0;
                        else
                          local_input[2][s] = input[n * C * H * W + c * H * W + h2 * W + w];
                        if (h3 < 0 || h3 >= H)
                          local_input[3][s] = 0;
                        else
                          local_input[3][s] = input[n * C * H * W + c * H * W + h3 * W + w];
                      }
                    }
                    //case1
                    __m512 i0 = _mm512_loadu_ps(&local_input[0][0]);
                    __m512 i1 = _mm512_loadu_ps(&local_input[1][0]);
                    __m512 i2 = _mm512_loadu_ps(&local_input[2][0]);
                    __m512 i3 = _mm512_loadu_ps(&local_input[3][0]);
                    __m512 f0 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+0) * S + 0]);
                    __m512 f1 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+1) * S + 0]);
                    __m512 f2 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+2) * S + 0]);
                    __m512 f3 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+3) * S + 0]);
                    __m512 to0 = _mm512_mul_ps(i0, f0);
                    __m512 to1 = _mm512_mul_ps(i1, f1);
                    __m512 to2 = _mm512_mul_ps(i2, f2);
                    __m512 to3 = _mm512_mul_ps(i3, f3);
                    o += _mm512_reduce_add_ps(to0);
                    o += _mm512_reduce_add_ps(to1);
                    o += _mm512_reduce_add_ps(to2);
                    o += _mm512_reduce_add_ps(to3);
                }
              }
              else {
                for (int r = 0; r < R; ++r) {
                  int h = h_start + r * dilation;
                  for (int s = 0; s < S; ++s) {
                    int w = w_start + s * dilation;
                    if (h < 0 || h >= H || w < 0 || w >= W) continue;
                    float i = input[n * C * H * W + c * H * W + h * W + w];
                    float f = filter[k * C * R * S + c * R * S + r * S + s];
                    o += i * f;
                  }
                }
              }
            }
            output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
          }
        }
      }
    }
  //send output
  if (mpi_rank == 0) {
    for(int proc=1; proc<mpi_world_size; proc++)
      if(ncounts[proc] != 0)
        MPI_Recv(output+(ns[proc]*K*OH*OW), ncounts[proc]*K*OH*OW, MPI_FLOAT, proc /*src*/, proc /*tag*/, MPI_COMM_WORLD, &status);
  }
  else {
    if(ncounts[mpi_rank] != 0)
      MPI_Isend(output, ncounts[mpi_rank]*K*OH*OW, MPI_FLOAT, 0 /*dst*/, mpi_rank /*tag*/, MPI_COMM_WORLD, &request);
  }
}


void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  //moved:OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  //moved:OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {

  //moved:MPI_Status status;
  //moved:MPI_Request request;

  //moved://send output
  //moved:if (mpi_rank == 0) {
  //moved:  for(int proc=1; proc<mpi_world_size; proc++)
  //moved:    if(ncounts[proc] != 0)
  //moved:      MPI_Recv(output+(ns[proc]*K*OH*OW), ncounts[proc]*K*OH*OW, MPI_FLOAT, proc /*src*/, proc /*tag*/, MPI_COMM_WORLD, &status);
  //moved:}
  //moved:else {
  //moved:  if(ncounts[mpi_rank] != 0)
  //moved:    MPI_Isend(output, ncounts[mpi_rank]*K*OH*OW, MPI_FLOAT, 0 /*dst*/, mpi_rank /*tag*/, MPI_COMM_WORLD, &request);
  //moved:}

}
. 2022-09-29 18:01:45 +09:00			`#include "convolution.h"`
			`#include <mpi.h>`
			`#include <stdio.h>`
			`#include <CL/cl.h>`
			`#include <cuda_runtime.h>`
			`#include <immintrin.h>`


			`static float input, output, *filter;`
			`static int N, C, H, W;`
			`static int K, R, S;`
			`static int OH, OW;`
			`static int pad;`
			`static int dilation;`
			`static int stride;`
			`static int mpi_rank, mpi_world_size;`

			`static int my_min(int x, int y) {`
			`return x < y ? x : y;`
			`}`

			`void my_alloc_tensor(float **t, int D0, int D1, int D2, int D3) {`
			`t = (float ) aligned_alloc(32, sizeof(float) * D0 * D1 * D2 * D3);`
			`if (*t == NULL) {`
			`printf("Failed to allocate memory for matrix.\n");`
			`exit(0);`
			`}`
			`}`

			`int ns[4];`
			`int ne[4];`
			`int ncounts[4];`

			`#define NUM_THREADS 40`
			`#define TSR 4`
			`#define TSS 16`

			`void convolution(`
			`float _input, float _output, float *_filter,`
			`int _N, int _C, int _H, int _W,`
			`int _K, int _R, int _S,`
			`int _pad, int _dilation, int _stride) {`
			`input = _input;`
			`output = _output;`
			`filter = _filter;`

			`MPI_Status status;`
			`MPI_Request request;`

			`OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;`
			`OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;`

			`for(int idx=0; idx<mpi_world_size; idx++) {`
			`ns[idx] = N / mpi_world_size * idx + my_min(idx, N % mpi_world_size);`
			`if(idx == (mpi_world_size-1))`
			`ne[idx] = N;`
			`else`
			`ne[idx] = N / mpi_world_size * (idx + 1) + my_min(idx + 1, N % mpi_world_size);`
			`ncounts[idx] = ne[idx]-ns[idx];`
			`}`
			`//memory allocation`
			`if(mpi_rank != 0) {`
			`my_alloc_tensor(&input, N, C, H, W);`
			`my_alloc_tensor(&output, N, K, OH, OW);`
			`my_alloc_tensor(&filter, K, C, R, S);`
			`}`
			`//send input`
			`if (mpi_rank == 0) {`
			`for(int proc=1; proc<mpi_world_size; proc++)`
			`if(ncounts[proc] != 0)`
			`MPI_Isend(input+ns[proc]CHW, ncounts[proc]CHW ,MPI_FLOAT, proc/dst/, proc /tag/, MPI_COMM_WORLD, &request);`
			`}`
			`else {`
			`if(ncounts[mpi_rank] != 0)`
			`MPI_Recv(input, ncounts[mpi_rank]CHW, MPI_FLOAT, 0 /src/, mpi_rank /tag*/, MPI_COMM_WORLD, &status);`
			`}`
			`//send filter`
			`MPI_Bcast(filter, KCRS, MPI_FLOAT, 0 /src*/, MPI_COMM_WORLD);`
			`#pragma omp parallel for num_threads(NUM_THREADS) collapse(3) schedule(dynamic)`
			`for (int n = 0; n < ncounts[mpi_rank]; ++n) {`
			`for (int k = 0; k < K; ++k) {`
			`for (int oh = 0; oh < OH; ++oh) {`
			`int h_start = oh * stride - pad;`
			`for (int ow = 0; ow < OW; ++ow) {`
			`int w_start = ow * stride - pad;`
			`float o = 0.0f;`
			`for (int c = 0; c < C; ++c) {`
			`if((S == 16) && (R % TSR) == 0) {`
			`for (int r = 0; r < R; r+=TSR) {`
			`int h0 = h_start + (r+0) * dilation;`
			`int h1 = h_start + (r+1) * dilation;`
			`int h2 = h_start + (r+2) * dilation;`
			`int h3 = h_start + (r+3) * dilation;`
			`float local_input[TSR][TSS];`
			`for (int s = 0; s < S; s++) {`
			`int w = w_start + s * dilation;`
			`if(w < 0 \|\| w >= W) {`
			`local_input[0][s] = 0;`
			`local_input[1][s] = 0;`
			`local_input[2][s] = 0;`
			`local_input[3][s] = 0;`
			`}`
			`else {`
			`if (h0 < 0 \|\| h0 >= H)`
			`local_input[0][s] = 0;`
			`else`
			`local_input[0][s] = input[n * C * H * W + c * H * W + h0 * W + w];`
			`if (h1 < 0 \|\| h1 >= H)`
			`local_input[1][s] = 0;`
			`else`
			`local_input[1][s] = input[n * C * H * W + c * H * W + h1 * W + w];`
			`if (h2 < 0 \|\| h2 >= H)`
			`local_input[2][s] = 0;`
			`else`
			`local_input[2][s] = input[n * C * H * W + c * H * W + h2 * W + w];`
			`if (h3 < 0 \|\| h3 >= H)`
			`local_input[3][s] = 0;`
			`else`
			`local_input[3][s] = input[n * C * H * W + c * H * W + h3 * W + w];`
			`}`
			`}`
			`//case1`
			`__m512 i0 = _mm512_loadu_ps(&local_input[0][0]);`
			`__m512 i1 = _mm512_loadu_ps(&local_input[1][0]);`
			`__m512 i2 = _mm512_loadu_ps(&local_input[2][0]);`
			`__m512 i3 = _mm512_loadu_ps(&local_input[3][0]);`
			`__m512 f0 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+0) * S + 0]);`
			`__m512 f1 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+1) * S + 0]);`
			`__m512 f2 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+2) * S + 0]);`
			`__m512 f3 = _mm512_loadu_ps(&filter[k * C * R * S + c * R * S + (r+3) * S + 0]);`
			`__m512 to0 = _mm512_mul_ps(i0, f0);`
			`__m512 to1 = _mm512_mul_ps(i1, f1);`
			`__m512 to2 = _mm512_mul_ps(i2, f2);`
			`__m512 to3 = _mm512_mul_ps(i3, f3);`
			`o += _mm512_reduce_add_ps(to0);`
			`o += _mm512_reduce_add_ps(to1);`
			`o += _mm512_reduce_add_ps(to2);`
			`o += _mm512_reduce_add_ps(to3);`
			`}`
			`}`
			`else {`
			`for (int r = 0; r < R; ++r) {`
			`int h = h_start + r * dilation;`
			`for (int s = 0; s < S; ++s) {`
			`int w = w_start + s * dilation;`
			`if (h < 0 \|\| h >= H \|\| w < 0 \|\| w >= W) continue;`
			`float i = input[n * C * H * W + c * H * W + h * W + w];`
			`float f = filter[k * C * R * S + c * R * S + r * S + s];`
			`o += i * f;`
			`}`
			`}`
			`}`
			`}`
			`output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;`
			`}`
			`}`
			`}`
			`}`
			`//send output`
			`if (mpi_rank == 0) {`
			`for(int proc=1; proc<mpi_world_size; proc++)`
			`if(ncounts[proc] != 0)`
			`MPI_Recv(output+(ns[proc]KOHOW), ncounts[proc]KOHOW, MPI_FLOAT, proc /src/, proc /tag/, MPI_COMM_WORLD, &status);`
			`}`
			`else {`
			`if(ncounts[mpi_rank] != 0)`
			`MPI_Isend(output, ncounts[mpi_rank]KOHOW, MPI_FLOAT, 0 /dst/, mpi_rank /tag*/, MPI_COMM_WORLD, &request);`
			`}`
			`}`


			`void convolution_init(`
			`int _N, int _C, int _H, int _W,`
			`int _K, int _R, int _S,`
			`int _pad, int _dilation, int _stride) {`
			`N = _N; C = _C; H = _H; W = _W;`
			`K = _K; R = _R; S = _S;`
			`pad = _pad;`
			`dilation = _dilation;`
			`stride = _stride;`

			`MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);`
			`MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);`

			`//moved:OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;`
			`//moved:OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;`

			`}`

			`void convolution_final(`
			`int _N, int _C, int _H, int _W,`
			`int _K, int _R, int _S,`
			`int _pad, int _dilation, int _stride) {`

			`//moved:MPI_Status status;`
			`//moved:MPI_Request request;`

			`//moved://send output`
			`//moved:if (mpi_rank == 0) {`
			`//moved: for(int proc=1; proc<mpi_world_size; proc++)`
			`//moved: if(ncounts[proc] != 0)`
			`//moved: MPI_Recv(output+(ns[proc]KOHOW), ncounts[proc]KOHOW, MPI_FLOAT, proc /src/, proc /tag/, MPI_COMM_WORLD, &status);`
			`//moved:}`
			`//moved:else {`
			`//moved: if(ncounts[mpi_rank] != 0)`
			`//moved: MPI_Isend(output, ncounts[mpi_rank]KOHOW, MPI_FLOAT, 0 /dst/, mpi_rank /tag*/, MPI_COMM_WORLD, &request);`
			`//moved:}`

			`}`