#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include "util.h"
#include <immintrin.h>

/*
 * Tensor buffers shared by every kernel in this file.  On rank 0 they are
 * pointed at the caller's buffers inside convolution(); on every other rank
 * they are allocated in convolution_init().
 */
static float *__restrict input, *__restrict output, *__restrict filter;
/* Input shape: batch, channels, height, width (set in convolution_init). */
static int N, C, H, W;
/* Filter shape: output channels, filter height, filter width. */
static int K, R, S;
/* Output height/width, derived from the values above in convolution_init. */
static int OH, OW;
static int pad;
static int dilation;
static int stride;
/* Rank of this process and communicator size, queried in convolution_init. */
static int mpi_rank, mpi_world_size;

/* Non-zero compiles out the MPI exchange; kernels then iterate over 0..N. */
#define SINGLE_NODE (0)
/* Non-zero enables the timer_stop() progress printfs in convolution(). */
#define TIME_MEASURE (0)
/* Rounds _A up to the next multiple of _SIZE (integer division based). */
#define ALIGN_UP(_A,_SIZE) ((((_A) + (_SIZE) - 1) / (_SIZE)) * (_SIZE))
/* NOTE: function-like macro, each argument may be evaluated twice. */
#define MIN(_A,_B) ((_A) < (_B) ? (_A) : (_B))
/* Filter row length (in floats) assumed by the AVX-512 kernels: one __m512. */
#define OPTIMAL_FILTER_SIZE (16)
/* Non-zero turns MM_PREFETCH into _mm_prefetch; zero makes it expand to nothing. */
#define ENABLE_PREFETCH (1)
#if (ENABLE_PREFETCH)
#define MM_PREFETCH(__A, __B) _mm_prefetch(__A, __B)
#else
#define MM_PREFETCH(__A, __B)
#endif


/*
 * AVX-512 kernel for the fast path (16x16 filter, pad 0, dilation 1, stride 1).
 *
 * For each (n, k) pair the 16 filter rows of one channel are loaded into 16
 * registers once and reused for every output position of that channel; the
 * per-channel partial sums are accumulated into output with "+=", which is
 * why the processed output range is zeroed first.
 *
 * nStart/nEnd: half-open batch range to process.  When SINGLE_NODE is set the
 *              range is ignored and the whole batch 0..N is processed.
 *
 * Expected shapes (from the dispatch in convolution()): N, C, K, H, W are
 * multiples of 32 and R == S == 16.
 *
 * Filter rows are read with the aligned _mm512_load_ps, so `filter` has to be
 * 64-byte aligned (every offset used here is a multiple of 256 floats).
 */
static inline void Calculation_Opt3(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = OPTIMAL_FILTER_SIZE * OPTIMAL_FILTER_SIZE;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  // Clear exactly the images that are accumulated into below.
#if (SINGLE_NODE)
  zero_tensor((float*)output, N, K, OH, OW);
#else
  if (nEnd > nStart) {
    zero_tensor((float*)output + (size_t)nStart * KOHOW, nEnd - nStart, K, OH, OW);
  }
#endif

  // Only (n, k) is distributed across threads.  The channel loop must stay
  // inside a single thread: every channel adds into the same output element,
  // so collapsing it as well would let two threads update one element
  // concurrently without synchronisation.
  #pragma omp parallel for num_threads(100) collapse(2) schedule(static)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int c = 0; c < C; c++) {
        // One 16x16 filter slice: 16 rows of 16 floats, one register per row.
        const float* pnStartFilter = &filter[k * CRS + (c * RS)];
        const __m512 b0 = _mm512_load_ps(&pnStartFilter[0]);
        const __m512 b1 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 1]);
        const __m512 b2 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 2]);
        const __m512 b3 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 3]);
        const __m512 b4 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 4]);
        const __m512 b5 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 5]);
        const __m512 b6 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 6]);
        const __m512 b7 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 7]);
        const __m512 b8 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 8]);
        const __m512 b9 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 9]);
        const __m512 b10 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 10]);
        const __m512 b11 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 11]);
        const __m512 b12 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 12]);
        const __m512 b13 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 13]);
        const __m512 b14 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 14]);
        const __m512 b15 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 15]);

        const int cHW = c * HW;
        const int nCHW_cHW = n * CHW + cHW;
        const int nKOHOW_kOHOW = n * KOHOW + k * OHOW;
        MM_PREFETCH((const char*)&input[nCHW_cHW], _MM_HINT_T0);
        for (int oh = 0; oh < OH; ++oh) {
          const int nKOHOW_kOHOW_ohOW = nKOHOW_kOHOW + oh * OW;
          const int nCHW_ohW_cHW = nCHW_cHW + oh * W;
          for (int ow = 0; ow < OW; ++ow) {
            // Top-left corner of the 16x16 input window for this output.
            const float* pnStartInput = &input[nCHW_ohW_cHW + ow];

            // Four independent accumulators shorten the FMA dependency chain.
            __m512 c0 = _mm512_setzero_ps();
            __m512 c1 = c0;
            __m512 c2 = c0;
            __m512 c3 = c0;

            // Input rows are W floats apart and not necessarily aligned.
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[0]), b0, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 1]), b1, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 2]), b2, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 3]), b3, c3);
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 4]), b4, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 5]), b5, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 6]), b6, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 7]), b7, c3);

            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 8]), b8, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 9]), b9, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 10]), b10, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 11]), b11, c3);
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 12]), b12, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 13]), b13, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 14]), b14, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 15]), b15, c3);

            // Fold the four accumulators, then the 16 lanes, into one scalar
            // and add this channel's contribution to the output element.
            const __m512 dot01 = _mm512_add_ps(c0, c1);
            const __m512 dot23 = _mm512_add_ps(c2, c3);
            const __m512 dot0123 = _mm512_add_ps(dot01, dot23);
            output[nKOHOW_kOHOW_ohOW + ow] += _mm512_reduce_add_ps(dot0123);
          }
        }
      }
    }
  }
}

/*
 * AVX-512 kernel for 16x16 filters that distributes (n, k, oh) across threads
 * and keeps the channel loop innermost, so each output element is computed in
 * registers and stored exactly once (no pre-zeroing of output is needed).
 *
 * The indexing reads input row (oh + r) and columns ow..ow+15 for filter row
 * r, i.e. it implements the pad 0 / dilation 1 / stride 1 case only.
 *
 * nStart/nEnd: half-open batch range to process; ignored when SINGLE_NODE is
 *              set, in which case 0..N is used.
 *
 * Filter rows are read with the aligned _mm512_load_ps, so `filter` has to be
 * 64-byte aligned; input rows use the unaligned load.
 *
 * NOTE(review): no caller of this function is visible in this file.
 */
static inline void Calculation_Opt2(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = 16 * 16;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;
  // Disabled debugging aid: one-past-the-end pointers for the bounds checks
  // further down (also disabled).
#if 0
  float* pnInputend = input + N * C * H * W;
  float* pnFilterend = filter + K * C * H * W;
  float* pnoutputend = output + N * K * OH * OW;
  #endif
// Expected shapes:
// N, C, K, H, W: reasonably large powers of two, at least 32
// R, S: 16
 #pragma omp parallel for collapse(3) schedule(static)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int kCRS = k * CRS;
        const int nKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int nCHW_ohW = n * CHW + oh * W; 
        
        // Hint the start of this input row and of filter k into cache.
        MM_PREFETCH((const char*)&input[nCHW_ohW], _MM_HINT_T0);
        MM_PREFETCH((const char*)&filter[kCRS], _MM_HINT_T0);
        for (int ow = 0; ow < OW; ++ow) {
          const int nCHW_ohW_ow = nCHW_ohW + ow; 
          // Four independent accumulators; filter row r always feeds
          // accumulator r % 4, across all channels.
          __m512 c0 = _mm512_setzero_ps();
          __m512 c1 = c0;
          __m512 c2 = c0;
          __m512 c3 = c0;
#pragma GCC unroll 4
          for (int c = 0; c < C; c++) {
            // Top-left of the 16x16 input window and of the filter slice for
            // channel c.
            const float* pnStartInput = &input[nCHW_ohW_ow + c * HW];
            const float* pnStartFilter = &filter[kCRS + (c * RS)];

            // Disabled bounds diagnostics (need the pointers declared in the
            // disabled block at the top of the function).
#if 0
            if (pnInputend < &pnStartInput[W * 16])
            {
              printf ("Input Assert! n : %d k : %d oh : %d ow : %d c :%d \n", n, k, oh, ow, c);
            }

            if (pnFilterend < &pnStartFilter[OPTIMAL_FILTER_SIZE * 15])
            {
              printf ("Filter Assert! n : %d k : %d oh : %d ow : %d c :%d \n", n, k, oh, ow, c);
            }
#endif

            // Filter rows 0..7 (aligned loads, 16 floats per row).
            const __m512 b0 = _mm512_load_ps(&pnStartFilter[0]);
            const __m512 b1 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 1]);
            const __m512 b2 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 2]);
            const __m512 b3 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 3]);
            const __m512 b4 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 4]);
            const __m512 b5 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 5]);
            const __m512 b6 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 6]);
            const __m512 b7 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 7]);


            // Input rows 0..7 of the window (W floats apart, unaligned loads).
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[0]), b0, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 1]), b1, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 2]), b2, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 3]), b3, c3);
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 4]), b4, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 5]), b5, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 6]), b6, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 7]), b7, c3);

            
            // Filter rows 8..15.
            const __m512 b8 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 8]);
            const __m512 b9 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 9]);
            const __m512 b10 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 10]);
            const __m512 b11 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 11]);
            const __m512 b12 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 12]);
            const __m512 b13 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 13]);
            const __m512 b14 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 14]);
            const __m512 b15 = _mm512_load_ps(&pnStartFilter[OPTIMAL_FILTER_SIZE * 15]);

            // Input rows 8..15 of the window.
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 8]), b8, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 9]), b9, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 10]), b10, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 11]), b11, c3);
            c0 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 12]), b12, c0);
            c1 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 13]), b13, c1);
            c2 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 14]), b14, c2);
            c3 = _mm512_fmadd_ps(_mm512_loadu_ps(&pnStartInput[W * 15]), b15, c3);
          }

          // Combine the four accumulators, then sum the 16 lanes to a scalar.
          const __m512 dot01 = _mm512_add_ps(c0, c1);
          const __m512 dot23 = _mm512_add_ps(c2, c3);
          const __m512 dot0123 = _mm512_add_ps(dot01, dot23);
          output[nKOHOW_kOHOW_ohOW + ow] = _mm512_reduce_add_ps(dot0123);
        }
      }
    }
  }
}


static inline void Calculation_Opt(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = 16 * 16;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  // printf("Optimal calculation :) \n");
// N, C, K, H, W: 32 이상의 적당히 큰 2의 지수승
// R, S: 16
  #pragma omp parallel for collapse(3) schedule(dynamic)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int rBound = MIN(16, H - oh);
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          o = 0.f;
          const int sBound = MIN(16, W - ow);
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + (c * RS);
            for (int r = 0; r < rBound; ++r) {
                const int h = oh + r;
                const int kCRS_cRS_rS = kCRS_cRS + r * 16;
                const int NCHW_CHW_hW_ow = NCHW_CHW + h * W + ow;
                for (int s = 0; s < sBound; ++s) {
                  o += input[NCHW_CHW_hW_ow + s] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

static void Calculation_Pad0_D1_S1(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = R * S;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  #pragma omp parallel for collapse(3)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int rBound = MIN(R, H - oh);
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          const int sBound = MIN(S, W - ow);
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + c * RS;
            for (int r = 0; r < rBound; ++r) {
                const int h = oh + r;
                const int kCRS_cRS_rS = kCRS_cRS + r * S;
                const int NCHW_CHW_hW_ow = NCHW_CHW + h * W + ow;
                for (int s = 0; s < sBound; ++s) {
                  o += input[NCHW_CHW_hW_ow + s] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

static void Calculation_Pad0_Dilation1(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = R * S;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  #pragma omp parallel for collapse(3)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int hi = oh * stride;
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          const int wi = ow * stride;
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + c * RS;
            for (int r = 0; r < R; ++r) {
                const int h = hi + r;
                const int kCRS_cRS_rS = kCRS_cRS + r * S;
                if (h >= H ) continue;
                const int NCHW_CHW_hW = NCHW_CHW + h * W;
                for (int s = 0; s < S; ++s) {
                  const int w = wi + s;
                  if (w >= W) continue;
                  //float i = input[NCHW_CHW_hW + w];
                  //float f = filter[kCRS_cRS_rS + s];
                  o += input[NCHW_CHW_hW + w] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

static void Calculation_Stride1(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = R * S;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  #pragma omp parallel for collapse(3)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int hi = oh - pad;
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          const int wi = ow - pad;
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + c * RS;
            for (int r = 0; r < R; ++r) {
                const int h = hi + r * dilation;
                const int kCRS_cRS_rS = kCRS_cRS + r * S;
                if (h < 0 || h >= H ) continue;
                const int NCHW_CHW_hW = NCHW_CHW + h * W;
                for (int s = 0; s < S; ++s) {
                  const int w = wi + s * dilation;
                  if (w < 0 || w >= W) continue;
                  //float i = input[NCHW_CHW_hW + w];
                  //float f = filter[kCRS_cRS_rS + s];
                  o += input[NCHW_CHW_hW + w] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

static void Calculation_Pad0(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = R * S;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  #pragma omp parallel for collapse(3)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int hi = oh * stride;
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          const int wi = ow * stride;
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + c * RS;
            for (int r = 0; r < R; ++r) {
                const int h = hi + r * dilation;
                const int kCRS_cRS_rS = kCRS_cRS + r * S;
                if (h >= H ) continue;
                const int NCHW_CHW_hW = NCHW_CHW + h * W;
                for (int s = 0; s < S; ++s) {
                  const int w = wi + s * dilation;
                  if (w >= W) continue;
                  //float i = input[NCHW_CHW_hW + w];
                  //float f = filter[kCRS_cRS_rS + s];
                  o += input[NCHW_CHW_hW + w] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

static void Calculation_Dilation1(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = R * S;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  #pragma omp parallel for collapse(3)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int hi = oh * stride - pad;
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          const int wi = ow * stride - pad;
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + c * RS;
            for (int r = 0; r < R; ++r) {
                const int h = hi + r;
                const int kCRS_cRS_rS = kCRS_cRS + r * S;
                if (h < 0 || h >= H ) continue;
                const int NCHW_CHW_hW = NCHW_CHW + h * W;
                for (int s = 0; s < S; ++s) {
                  const int w = wi + s;
                  if (w < 0 || w >= W) continue;
                  //float i = input[NCHW_CHW_hW + w];
                  //float f = filter[kCRS_cRS_rS + s];
                  o += input[NCHW_CHW_hW + w] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

static void Calculation_Base(int nStart, int nEnd)
{
  const int HW = H * W;
  const int CHW = C * HW;
  const int RS = R * S;
  const int CRS = C * RS;
  const int OHOW = OH * OW;
  const int KOHOW = K * OHOW;

  #pragma omp parallel for collapse(3)
#if (!SINGLE_NODE)
  for (int n = nStart; n < nEnd; ++n) {
#else
  for (int n = 0; n < N; ++n) {
#endif
    for (int k = 0; k < K; ++k) {
      for (int oh = 0; oh < OH; ++oh) {
        const int NCHW = n * CHW;
        //const int nKOHOW = n * KOHOW;
        const int kCRS = k * CRS;
        // const int kKOHOW_kOHOW = nKOHOW + k * OHOW;
        const int kKOHOW_kOHOW_ohOW = n * KOHOW + k * OHOW + oh * OW;
        const int hi = oh * stride - pad;
        for (int ow = 0; ow < OW; ++ow) {
          float o = 0.f;
          const int wi = ow * stride - pad;
          for (int c = 0; c < C; ++c) {
            const int NCHW_CHW = NCHW + c * HW;
            const int kCRS_cRS = kCRS + c * RS;
            for (int r = 0; r < R; ++r) {
                const int h = hi + r * dilation;
                const int kCRS_cRS_rS = kCRS_cRS + r * S;
                if (h < 0 || h >= H ) continue;
                const int NCHW_CHW_hW = NCHW_CHW + h * W;
                for (int s = 0; s < S; ++s) {
                  const int w = wi + s * dilation;
                  if (w < 0 || w >= W) continue;
                  //float i = input[NCHW_CHW_hW + w];
                  //float f = filter[kCRS_cRS_rS + s];
                  o += input[NCHW_CHW_hW + w] * filter[kCRS_cRS_rS + s];
              }
            }
          }
          output[kKOHOW_kOHOW_ohOW + ow] = o;
        }
      }
    }
  }
}

/*
 * Picks the most specialised scalar kernel for the current pad / dilation /
 * stride settings and runs it on the batch range [start, end).
 */
static inline void Calculation(int start, int end)
{
  const int bNoPad = (pad == 0);
  const int bUnitDilation = (dilation == 1);
  const int bUnitStride = (stride == 1);

  if (bNoPad && bUnitDilation && bUnitStride)
  {
    // [TODO] optimal algorithm
    Calculation_Pad0_D1_S1(start, end);
    return;
  }
  if (bNoPad && bUnitDilation)
  {
    Calculation_Pad0_Dilation1(start, end);
    return;
  }
  if (bNoPad)
  {
    Calculation_Pad0(start, end);
    return;
  }
  if (bUnitDilation)
  {
    Calculation_Dilation1(start, end);
    return;
  }
  if (bUnitStride)
  {
    Calculation_Stride1(start, end);
    return;
  }
  Calculation_Base(start, end);
}

/* Communicator size required for the AVX-512 fast path in convolution(). */
#define OPTIMAL_NODE_CNT (2)
/* Number of separate MPI messages each tensor transfer is split into on the fast path. */
#define MPI_CH_CNT (4)
/* NOTE(review): not referenced anywhere in the visible source. */
#define MPI_FILTER_CH_CNT (MPI_CH_CNT * OPTIMAL_NODE_CNT)
void convolution(
    float *_input, float *_output, float *_filter, 
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {

#if (!SINGLE_NODE)
  MPI_Status stMpiStatus;
  MPI_Request stMpiRequest[12];
#endif

  if (_pad == 0
      && _dilation == 1
      && _stride == 1
#if (!SINGLE_NODE)
      && mpi_world_size == OPTIMAL_NODE_CNT
  #endif
      && (((N | C | K | H | W) & (32 - 1)) == 0)
      && R == 16 && S == 16)
      {
#if (!SINGLE_NODE)
        const int SendNodeSize = N / OPTIMAL_NODE_CNT / MPI_CH_CNT;
        const int SendFilterSize = K / MPI_CH_CNT;
#endif
        // Optimal path
        if (mpi_rank == 0) {
          input = _input;
          output = _output;
          filter = _filter;
      #if (!SINGLE_NODE)
          MPI_Isend(input + (SendNodeSize * (MPI_CH_CNT + 0)) * C * H * W, (SendNodeSize) * C * H * W , MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
          MPI_Isend(input + (SendNodeSize * (MPI_CH_CNT + 1)) * C * H * W, (SendNodeSize) * C * H * W , MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
          MPI_Isend(input + (SendNodeSize * (MPI_CH_CNT + 2)) * C * H * W, (SendNodeSize) * C * H * W , MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[2]);
          MPI_Isend(input + (SendNodeSize * (MPI_CH_CNT + 3)) * C * H * W, (SendNodeSize) * C * H * W , MPI_FLOAT, 1, 3, MPI_COMM_WORLD, &stMpiRequest[3]);
          MPI_Isend(filter                                 , SendFilterSize * C * R * S, MPI_FLOAT, 1, 4, MPI_COMM_WORLD, &stMpiRequest[4]);
          MPI_Isend(filter + SendFilterSize * C * R * S    , SendFilterSize * C * R * S, MPI_FLOAT, 1, 5, MPI_COMM_WORLD, &stMpiRequest[5]);
          MPI_Isend(filter + SendFilterSize * 2 * C * R * S, SendFilterSize * C * R * S, MPI_FLOAT, 1, 6, MPI_COMM_WORLD, &stMpiRequest[6]);
          MPI_Isend(filter + SendFilterSize * 3 * C * R * S, SendFilterSize * C * R * S, MPI_FLOAT, 1, 7, MPI_COMM_WORLD, &stMpiRequest[7]);
          MPI_Irecv(output + (SendNodeSize * (MPI_CH_CNT + 0)) * K * OH * OW, (SendNodeSize) * K * OH * OW, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[8]);
          MPI_Irecv(output + (SendNodeSize * (MPI_CH_CNT + 1)) * K * OH * OW, (SendNodeSize) * K * OH * OW, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[9]);
          MPI_Irecv(output + (SendNodeSize * (MPI_CH_CNT + 2)) * K * OH * OW, (SendNodeSize) * K * OH * OW, MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[10]);
          MPI_Irecv(output + (SendNodeSize * (MPI_CH_CNT + 3)) * K * OH * OW, (SendNodeSize) * K * OH * OW, MPI_FLOAT, 1, 3, MPI_COMM_WORLD, &stMpiRequest[11]);
          //printf("Master receive : %x, %x, %x, %x\n", (SendNodeSize * MPI_CH_CNT + 0), (SendNodeSize * MPI_CH_CNT + 1), (SendNodeSize * MPI_CH_CNT + 2), (SendNodeSize * MPI_CH_CNT + 3));
      #if (TIME_MEASURE)
          printf("Master send started : %f sec\n", timer_stop(0));
      #endif
      #endif
      // printf ("Optimized path! SendNodeOffset : %d, mpi_world_size : %d", N / OPTIMAL_NODE_CNT / MPI_CH_CNT, mpi_world_size);
      #if (!SINGLE_NODE)
            Calculation_Opt3(0, N / OPTIMAL_NODE_CNT);
      #else
            Calculation_Opt3(0, N);
      #endif

      #if (TIME_MEASURE)
          printf("Master calculation complete : %f sec\n", timer_stop(0));
      #endif
      #if (!SINGLE_NODE)
      #pragma GCC unroll 12
          for(int i = 0; i < 12; i++)
          {
            MPI_Wait(&stMpiRequest[i], &stMpiStatus);
          }
      #if (TIME_MEASURE)
          printf("Master recieve complete : %f sec\n", timer_stop(0));
      #endif
    #endif
      }
#if (!SINGLE_NODE)
      else
      {
        // printf("Check memory pointer, input : %x Filter :% x output : %x\n", input, filter, output);
        MPI_Irecv(input + (SendNodeSize * 0) * C * H * W, SendNodeSize * C * H * W , MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
        MPI_Irecv(input + (SendNodeSize * 1) * C * H * W, SendNodeSize * C * H * W , MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
        MPI_Irecv(input + (SendNodeSize * 2) * C * H * W, SendNodeSize * C * H * W , MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[2]);
        MPI_Irecv(input + (SendNodeSize * 3) * C * H * W, SendNodeSize * C * H * W , MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &stMpiRequest[3]);
        MPI_Irecv(filter + SendFilterSize * 0 * C * R * S, SendFilterSize * C * R * S, MPI_FLOAT, 0, 4, MPI_COMM_WORLD, &stMpiRequest[4]);
        MPI_Irecv(filter + SendFilterSize * 1 * C * R * S, SendFilterSize * C * R * S, MPI_FLOAT, 0, 5, MPI_COMM_WORLD, &stMpiRequest[5]);
        MPI_Irecv(filter + SendFilterSize * 2 * C * R * S, SendFilterSize * C * R * S, MPI_FLOAT, 0, 6, MPI_COMM_WORLD, &stMpiRequest[6]);
        MPI_Irecv(filter + SendFilterSize * 3 * C * R * S, SendFilterSize * C * R * S, MPI_FLOAT, 0, 7, MPI_COMM_WORLD, &stMpiRequest[7]);

      #pragma GCC unroll 8
          for(int i = 0; i < 8; i++)
          {
            MPI_Wait(&stMpiRequest[i], &stMpiStatus);
          }
    #if (TIME_MEASURE)
        printf("Slave receive complete : %f sec\n", timer_stop(0));
    #endif
        Calculation_Opt3(0, N / OPTIMAL_NODE_CNT);
        
        // printf("Slave set from output[%d], stride all : [%d]\n", SendNodeOffset, SendNodeOffset * KOHOW);    
        // printf("Slave set end output[%d], stride all : [%d]\n", N, N * KOHOW);
    #if (TIME_MEASURE)
        printf("Slave calculation complete : %f sec\n", timer_stop(0));
    #endif
        // printf("Slave send from output[%d]\n", SendNodeOffset * C * OH * OW);
        MPI_Isend(output                                   , SendNodeSize * K * OH * OW, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
        MPI_Isend(output + (SendNodeSize * 1) * K * OH * OW, SendNodeSize * K * OH * OW, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
        MPI_Isend(output + (SendNodeSize * 2) * K * OH * OW, SendNodeSize * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[2]);
        MPI_Isend(output + (SendNodeSize * 3) * K * OH * OW, SendNodeSize * K * OH * OW, MPI_FLOAT, 0, 3, MPI_COMM_WORLD, &stMpiRequest[3]);
        //printf ("FirstData : %3.f, %3.f, %3.f, %3.f\n", *(output + (SendNodeSize *0) * K * OH * OW),*(output + (SendNodeSize *1) * K * OH * OW),*(output + (SendNodeSize *2) * K * OH * OW),*(output + (SendNodeSize *3) * K * OH * OW));
        //printf("Slave send : %x, %x, %x, %x\n",(SendNodeSize * 0) * K * OH * OW, (SendNodeSize * 1) * K * OH * OW, (SendNodeSize * 2) * K * OH * OW, (SendNodeSize * 3) * K * OH * OW);
#pragma GCC unroll 4
        for(int i = 0; i < 4; i++)
        {
          MPI_Wait(&stMpiRequest[i], &stMpiStatus);
        }
    #if (TIME_MEASURE)
        printf("Slave send complete : %f sec\n", timer_stop(0));
    #endif
        }
#endif // SINGLE_NODE
      }
      else
      {
#if (!SINGLE_NODE)
        const int SendNodeSize = N / 2;
        const int SendNodeOffset = mpi_world_size > 1 ? (N - SendNodeSize) : N;
#endif
      if (mpi_rank == 0) {
        input = _input;
        output = _output;
        filter = _filter;
    #if (!SINGLE_NODE)
        if (mpi_world_size > 1 && SendNodeSize > 0)
        {
          MPI_Isend(input + SendNodeOffset * C * H * W, SendNodeSize * C * H * W , MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
          MPI_Isend(filter, K * C * R * S, MPI_FLOAT, 1, 1, MPI_COMM_WORLD, &stMpiRequest[1]);
          MPI_Irecv(output + SendNodeOffset * K * OH * OW, SendNodeSize * K * OH * OW, MPI_FLOAT, 1, 2, MPI_COMM_WORLD, &stMpiRequest[2]);
    #if (TIME_MEASURE)
        printf("Master send started : %f sec\n", timer_stop(0));
    #endif
        }
    #endif
    #if (!SINGLE_NODE)
          // printf ("SendNodeOffset : %d, mpi_world_size : %d", SendNodeOffset, mpi_world_size);
          Calculation(0, SendNodeOffset);
    #else
          Calculation(0, N);
    #endif

    #if (TIME_MEASURE)
        printf("Master calculation complete : %f sec\n", timer_stop(0));
    #endif
    #if (!SINGLE_NODE)
        if (mpi_world_size > 1 && SendNodeSize > 0)
        {
          MPI_Wait(&stMpiRequest[0], &stMpiStatus);
          MPI_Wait(&stMpiRequest[1], &stMpiStatus);
          MPI_Wait(&stMpiRequest[2], &stMpiStatus);
    #if (TIME_MEASURE)
        printf("Master recieve complete : %f sec\n", timer_stop(0));
    #endif
        }
    #endif
      }
    #if (!SINGLE_NODE)
      else if (SendNodeSize)
      {
        // printf("Check memory pointer, input : %x Filter :% x output : %x\n", input, filter, output);
        MPI_Irecv(input, SendNodeSize * C * H * W , MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &stMpiRequest[0]);
        MPI_Irecv(filter, K * C * R * S, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &stMpiRequest[1]);

        MPI_Wait(&stMpiRequest[0], &stMpiStatus);
        MPI_Wait(&stMpiRequest[1], &stMpiStatus);
    #if (TIME_MEASURE)
        printf("Slave receive complete : %f sec\n", timer_stop(0));
    #endif
        Calculation(0, SendNodeSize);
        
        // printf("Slave set from output[%d], stride all : [%d]\n", SendNodeOffset, SendNodeOffset * KOHOW);    
        // printf("Slave set end output[%d], stride all : [%d]\n", N, N * KOHOW);
    #if (TIME_MEASURE)
        printf("Slave calculation complete : %f sec\n", timer_stop(0));
    #endif
        
        
        // printf("Slave send from output[%d]\n", SendNodeOffset * C * OH * OW);
        MPI_Isend(output, SendNodeSize * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &stMpiRequest[2]); 
        MPI_Wait(&stMpiRequest[2], &stMpiStatus);
    #if (TIME_MEASURE)
        printf("Slave send complete : %f sec\n", timer_stop(0));
    #endif
      }
    #endif
  }  
}

/*
 * Caches the problem shape in the file-scope variables, derives the output
 * extent, queries the MPI rank and communicator size, and on every rank
 * other than 0 allocates input/output buffers for half of the batch (N / 2
 * images) plus a full-size filter buffer.
 */
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N;
  C = _C;
  H = _H;
  W = _W;
  K = _K;
  R = _R;
  S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  // Extent covered by the dilated filter along each axis.
  const int spanR = dilation * (R - 1) + 1;
  const int spanS = dilation * (S - 1) + 1;
  OH = (H + 2 * pad - spanR) / stride + 1;
  OW = (W + 2 * pad - spanS) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // Rank 0 works on the caller's buffers and needs no storage of its own.
  if (mpi_rank == 0)
  {
    return;
  }

  const int halfBatch = N / 2;

  alloc_tensor((float**)&input, halfBatch, _C, _H, _W);
  alloc_tensor((float**)&output, halfBatch, _K, OH, OW);
  alloc_tensor((float**)&filter, _K, _C, _R, _S);
}

/*
 * Teardown hook.  Currently does nothing: none of the arguments are used and
 * no buffers are released here.
 */
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  (void)_N;
  (void)_C;
  (void)_H;
  (void)_W;
  (void)_K;
  (void)_R;
  (void)_S;
  (void)_pad;
  (void)_dilation;
  (void)_stride;
}