#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include "util.h"
#include <immintrin.h>

/* Tensor buffers used by convolution(); assigned at the start of each call. */
static float *input;
static float *output;
static float *filter;

/* Input dimensions, stored by convolution_init(). */
static int N;
static int C;
static int H;
static int W;

/* Filter dimensions, stored by convolution_init(). */
static int K;
static int R;
static int S;

/* Output spatial dimensions, computed inside convolution(). */
static int OH;
static int OW;

/* Convolution hyper-parameters, stored by convolution_init(). */
static int pad;
static int dilation;
static int stride;

/* MPI rank of this process and communicator size, queried in convolution_init(). */
static int mpi_rank;
static int mpi_world_size;

/* Thread count passed to the OpenMP num_threads() clause in convolution(). */
int num_threads = 200;

void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {

    int size[2];
    MPI_Request     request;
    MPI_Status      status;

    int CHW             ;
    int HW              ;
    int CRS             ;
    int RS              ;

    int KOHOW           ;
    int OHOW            ;

    input   = _input    ;
    output  = _output   ;
    filter  = _filter   ;

    //  Asymetric load balancing because of MPI communiation
    if (mpi_world_size == 2) size[1] = (int) ( (float)_N * 0.45f);
    else                     size[1] = 0                        ;
    size[0] = N - size[1];

    OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
    OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

    if (mpi_rank == 0 && mpi_world_size == 2) {
        MPI_Isend(&input[size[0]*C*H*W], size[1]*C*H*W, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request);
        MPI_Isend( filter              , K*C*R*S      , MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request);
    }
    else if (mpi_world_size == 2) {
        alloc_tensor(&input , size[1], C, H , W );
        alloc_tensor(&output, size[1], K, OH, OW);
        alloc_tensor(&filter, K      , C, R , S );
        MPI_Recv(input , size[1]*C*H*W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
        MPI_Recv(filter, K*C*R*S      , MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
    }

    CHW     =   C * H * W   ;
    HW      =   H * W       ;
    CRS     =   C * R * S   ;
    RS      =   R * S       ;

    KOHOW   =   K * OH * OW ;
    OHOW    =   OH * OW     ;

    if (pad == 0 && dilation == 1 && stride == 1 && (S % 16 == 0)){
  //if (0){
    //  pad == 0 --> No need to check input boundary
    //  dilation 1 && stride == 1 --> No multiplication to comput input / filter array index
    //  S % 16 == 0 ---> vector operation is possible
    #pragma omp parallel for num_threads(num_threads) collapse(3) schedule(dynamic)
        for (int n = 0; n < size[mpi_rank]; ++n) {
            for (int k = 0; k < K; ++k) {
                for (int oh = 0; oh < OH; ++oh) {
                    int o_base  =   n  * KOHOW + k * OHOW + oh * OW ;

                    for (int ow = 0; ow < OW; ++ow) {
                        int     o_idx   =   o_base + ow             ;
                        __m512  out0    =  _mm512_setzero_ps()      ;

                        for (int c = 0; c < C; ++c) {
                            int i_base  =   n * CHW + c * HW        ;
                            int f_base  =   k * CRS + c * RS        ;

                            for (int r  = 0 ; r < R  ; ++r) {
                                int h   =   oh + r  ;

                                for (int s  = 0; s < S ; s += 16 ) {
                                    int w   = ow  +  s  ;

                                    __m512 i0   =   _mm512_loadu_ps(&input [i_base + h * W + w]);
                                    __m512 f0   =   _mm512_loadu_ps(&filter[f_base + r * S + s]);

                                    out0        =   _mm512_fmadd_ps(f0, i0, out0)   ;
                                } // s
                            } // r
                        } // c
                        output[o_idx] = _mm512_reduce_add_ps(out0);
                    } //ow
                } // oh
            } // k
        } //
    } else {
    //  pad != 0 || dilation != 1 || stride != 1
    //  ---> Input Boundary check required or
    //  ---> Multiplication required to compute indices
    #pragma omp parallel for num_threads(num_threads) collapse(3) schedule(dynamic)
        for (int n = 0; n < size[mpi_rank]; ++n) {
            for (int k = 0; k < K; ++k) {
                for (int oh = 0; oh < OH; ++oh) {
                    for (int ow = 0; ow < OW; ++ow) {
                        float   o       =   0.f;
                        int     o_idx   =   n  * K  * OH * OW +
                                            k  * OH * OW +
                                            oh * OW +
                                            ow ;

                        for (int c = 0; c < C; ++c) {

                            int i_base  =   n * C * H * W   +
                                            c * H * W       ;
                            int f_base  =   k * C * R * S   +
                                            c * R * S       ;

                            int h_base  =   oh * stride - pad   ;

                            for (int r = 0; r < R - 7 ; r += 8) {
                                int     h[8]    ;

                                h[0] = h_base + (r +  0) * dilation;
                                h[1] = h_base + (r +  1) * dilation;
                                h[2] = h_base + (r +  2) * dilation;
                                h[3] = h_base + (r +  3) * dilation;
                                h[4] = h_base + (r +  4) * dilation;
                                h[5] = h_base + (r +  5) * dilation;
                                h[6] = h_base + (r +  6) * dilation;
                                h[7] = h_base + (r +  7) * dilation;

                                for (int s = 0; s < S ; ++s) {

                                    float   i[8]    ;
                                    float   f[8]    ;
                                    int     w   = ow * stride - pad + s * dilation;

                                    if (h[0] >= 0 && h[0] < H && w >= 0 && w < W)   { i[0] = input[i_base + h[0] * W + w] ; }
                                    else                                            { i[0] = 0.0f                         ; }

                                    if (h[1] >= 0 && h[1] < H && w >= 0 && w < W)   { i[1] = input[i_base + h[1] * W + w] ; }
                                    else                                            { i[1] = 0.0f                         ; }

                                    if (h[2] >= 0 && h[2] < H && w >= 0 && w < W)   { i[2] = input[i_base + h[2] * W + w] ; }
                                    else                                            { i[2] = 0.0f                         ; }

                                    if (h[3] >= 0 && h[3] < H && w >= 0 && w < W)   { i[3] = input[i_base + h[3] * W + w] ; }
                                    else                                            { i[3] = 0.0f                         ; }

                                    if (h[4] >= 0 && h[4] < H && w >= 0 && w < W)   { i[4] = input[i_base + h[4] * W + w] ; }
                                    else                                            { i[4] = 0.0f                         ; }

                                    if (h[5] >= 0 && h[5] < H && w >= 0 && w < W)   { i[5] = input[i_base + h[5] * W + w] ; }
                                    else                                            { i[5] = 0.0f                         ; }

                                    if (h[6] >= 0 && h[6] < H && w >= 0 && w < W)   { i[6] = input[i_base + h[6] * W + w] ; }
                                    else                                            { i[6] = 0.0f                         ; }

                                    if (h[7] >= 0 && h[7] < H && w >= 0 && w < W)   { i[7] = input[i_base + h[7] * W + w] ; }
                                    else                                            { i[7] = 0.0f                         ; }

                                    f[0] = filter[f_base + (r +  0) * S + s];
                                    f[1] = filter[f_base + (r +  1) * S + s];
                                    f[2] = filter[f_base + (r +  2) * S + s];
                                    f[3] = filter[f_base + (r +  3) * S + s];
                                    f[4] = filter[f_base + (r +  4) * S + s];
                                    f[5] = filter[f_base + (r +  5) * S + s];
                                    f[6] = filter[f_base + (r +  6) * S + s];
                                    f[7] = filter[f_base + (r +  7) * S + s];

                                    o = i[0] * f[0] +
                                        i[1] * f[1] +
                                        i[2] * f[2] +
                                        i[3] * f[3] +
                                        i[4] * f[4] +
                                        i[5] * f[5] +
                                        i[6] * f[6] +
                                        i[7] * f[7] +
                                        o           ;
                                } // s
                            } // r
                            int r_start = R / 8 * 8 ;
                            for (int r = r_start; r < R ; ++r) {
                                for (int s = 0; s < S ; ++s) {
                                    int h = oh * stride - pad + r * dilation;
                                    int w = ow * stride - pad + s * dilation;
                                    if (h < 0 || h >= H || w < 0 || w >= W) continue;
                                    float i = input [i_base + h * W + w];
                                    float f = filter[f_base + r * S + s];
                                    o += i * f;
                                } // s
                            } // r
                        } // c
                        output[o_idx] = o;
                    } //ow
                } // oh
            } // k
        } //
    }
    if (mpi_rank == 0 && mpi_world_size == 2) {
        MPI_Recv(&output[size[0]*K*OH*OW], size[1]*K*OH*OW, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &status);
    }
    else if(mpi_world_size == 2){
        MPI_Isend(output, size[1]*K*OH*OW, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request);
  }
}

/*
 * Stores the convolution problem description in the file-level state and
 * queries this process's MPI rank and the size of MPI_COMM_WORLD.
 *
 * _N, _C, _H, _W          : input dimensions
 * _K, _R, _S              : filter dimensions
 * _pad, _dilation, _stride : convolution hyper-parameters
 */
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
    /* Input dimensions. */
    N = _N;
    C = _C;
    H = _H;
    W = _W;

    /* Filter dimensions. */
    K = _K;
    R = _R;
    S = _S;

    /* Hyper-parameters. */
    stride   = _stride;
    dilation = _dilation;
    pad      = _pad;

    /* MPI topology, read later by convolution(). */
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
}

/*
 * Finalization hook with the same parameter list as convolution_init().
 * Currently a no-op: none of the arguments are used.
 *
 * NOTE(review): the buffers that convolution() obtains through alloc_tensor
 * are not released here — confirm whether they should be.
 */
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
    (void)_N;
    (void)_C;
    (void)_H;
    (void)_W;
    (void)_K;
    (void)_R;
    (void)_S;
    (void)_pad;
    (void)_dilation;
    (void)_stride;
}