#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include <cuda_runtime.h>

#include "util.h"

// NVIDIA GeForce RTX 3090
// - Computing capability: 8.6
// - Maximum number of threads per block: 1024
// - Maximum amount of shared memory per SM: 100KB
// - Maximum amount of shared memory per thread block : 99KB
// - Maximum amount of local memory per thread: 512KB
// Maximum x-dimension of a grid of thread blocks: 2^31-1
// Maximum y- or z-dimension of a grid of thread blocks: 65535
// - In a SM:
//     128 FP32 cores in a SM
//     64 INT32 cores for integer math
//     4 warp schedulers


#define MAX_NUM_GPU  4
#define MAX_THREAD_PER_BLOCK 8  // 16 x 16 = 256

#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y)     ( CEIL_DIV((x),(y)) * (y) )
#define MIN(a,b)      ( ((a) < (b)) ? (a) : (b) )

#define CUDA_LOG(fmt, ...)        //printf(fmt, ##__VA_ARGS__)
#define CUDA_KERN_LOG(fmt, ...)   //printf(fmt, ##__VA_ARGS__)


#define CUDA_CALL(d, f)                                                                       \
  {                                                                                           \
    cudaError_t err = (f);                                                                    \
    if (err != cudaSuccess) {                                                                 \
      fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n",                      \
              mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err));               \
      exit(1);                                                                                \
    }                                                                                         \
  }

static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int num_devices;

// Array of device (GPU) pointers
static float *i_d[MAX_NUM_GPU];
static float *f_d[MAX_NUM_GPU];
static float *o_d[MAX_NUM_GPU];

__global__ void conv_core(float *in, float *flt, float *out,
  int N, int K, int C, int H, int W, int R, int S, int OH, int OW,
  int stride, int pad, int dilation)
{
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int globalCol = blockDim.x * blockIdx.x + col;
  const int globalRow = blockDim.y * blockIdx.y + row;

  if (globalCol < OW && globalRow < OH)
  {
    for (int n = 0; n < N; ++n)
    {
      for (int k = 0; k < K; ++k)
      {
        float *outForK = &out[n * K * OH * OW + k * OH * OW];

        float o = 0.0f;
        for (int c = 0; c < C; ++c)
        {
          float *inForC = &in[n * C * H * W + c * H * W];
          float *fltForC = &flt[k * C * R * S + c * R * S];

          for (int r = 0; r < R; r++)
          {
            for (int s = 0; s < S; s++)
            {
              int h = globalRow * stride - pad + r * dilation;
              int w = globalCol * stride - pad + s * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W) continue;

              float i = inForC[h * W + w];
              float f = fltForC[r * S + s];

              o += i * f;
              // printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n",
              //   globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o);
            }
          }
        }

        outForK[globalRow * OW + globalCol] = o;

        CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, o:%p, o:%f\n",
          n, k, globalRow, globalCol,
          &outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]);
      }
    }
  }
}


#define MPI_ASYNC 1

void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
#if MPI_ASYNC == 1
  MPI_Request reqs[3] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL, MPI_REQUEST_NULL };
#endif

  const int slicedN = _N / mpi_world_size;

  ////////////////////////////////////////////////////////////////////////////////
  // scatter
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;
  }

  if (slicedN > 0)
  {
#if MPI_ASYNC == 1
    MPI_Ibcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD, &reqs[0]);
#else
    MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
#endif

    if (mpi_rank == 0)
    {
      const int sizeOfN = slicedN * C * H * W;
      float *inputForNodes = input + (N * C * H * W);

      for (int i = 1; i < mpi_world_size; i++)
      {
#if MPI_ASYNC == 1
        MPI_Isend(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &reqs[1]);
#else
        MPI_Send(inputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD);
#endif
      }
    }
    else
    {
#if MPI_ASYNC == 1
      MPI_Irecv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &reqs[1]);
#else
      MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif

      // zero_tensor(output, N, K, OH, OW);
    }

#if MPI_ASYNC == 1
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
#endif
  }

  ////////////////////////////////////////////////////////////////////////////////
  // calc
  int slicedK = K / num_devices;
  int startK[MAX_NUM_GPU];
  int sizeK[MAX_NUM_GPU];
  for (int d = 0; d < num_devices; d++)
  {
    startK[d] = slicedK * d;
    sizeK[d] = slicedK;
  }
  sizeK[num_devices - 1] = K - startK[num_devices - 1];

  for (int d = 0; d < num_devices; d++)
  {
    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }

    CUDA_CALL( d, cudaSetDevice(d) );

    // Upload input data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
      mpi_rank, d, i_d[d], input, N * C * H * W * sizeof(float), N * C * H * W * sizeof(float));

    CUDA_CALL( d, cudaMemcpyAsync(i_d[d], input,
      N * C * H * W * sizeof(float),
      cudaMemcpyHostToDevice) );

    // Upload filter data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
      mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S], sizeK[d] * C * R * S * sizeof(float), sizeK[d] * C * R * S * sizeof(float));

    CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
      sizeK[d] * C * R * S * sizeof(float),
      cudaMemcpyHostToDevice) );

    // Launch kernel on every GPU
    dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
    dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK), CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);

    CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
      ", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
      ", stride=%d, pad=%d, dilation=%d\n",
      mpi_rank, d, i_d[d], f_d[d], o_d[d],
      N, sizeK[d], C, H, W, R, S, OH, OW,
      stride, pad, dilation);

    conv_core<<<gridDim, blockDim>>>(i_d[d], f_d[d], o_d[d],
      N, sizeK[d], C, H, W, R, S, OH, OW,
      stride, pad, dilation);
  }

  // Download output data from GPUs
  for (int d = 0; d < num_devices; d++)
  {
    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }

    CUDA_CALL( d, cudaSetDevice(d) );

    for (int n = 0; n < N; n++)
    {
      CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
        mpi_rank, d, &output[(n * K * OH * OW) + (startK[d] * OH * OW)], o_d[d] + n * sizeK[d] * OH * OW,
        sizeK[d] * OH * OW * sizeof(float), sizeK[d] * OH * OW * sizeof(float));

      CUDA_CALL( d, cudaMemcpyAsync(&output[(n * K * OH * OW) + (startK[d] * OH * OW)],
        o_d[d] + n * sizeK[d] * OH * OW,
        sizeK[d] * OH * OW * sizeof(float),
        cudaMemcpyDeviceToHost) );
    }
  }

  for (int d = 0; d < num_devices; d++)
  {
    CUDA_CALL( d, cudaDeviceSynchronize() );
  }

  ////////////////////////////////////////////////////////////////////////////////
  // gather
  if (slicedN > 0)
  {
    if (mpi_rank == 0)
    {
      const int sizeOfN = slicedN * K * OH * OW;
      float *outputForNodes = output + (N * K * OH * OW);

      for (int i = 1; i < mpi_world_size; i++)
      {
#if MPI_ASYNC == 1
        MPI_Irecv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, &reqs[2]);
#else
        MPI_Recv(outputForNodes + sizeOfN * (i - 1), sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
#endif
      }
    }
    else
    {
#if MPI_ASYNC == 1
      MPI_Isend(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, &reqs[2]);
#else
      MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
#endif
    }

#if MPI_ASYNC == 1
    MPI_Wait(&reqs[2], MPI_STATUS_IGNORE);
#endif
  }
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  N = _N / mpi_world_size;

  if (mpi_rank == 0)
  {
    N = _N - N * (mpi_world_size - 1);
  }
  else
  {
    if (N > 0)
    {
      alloc_tensor(&input, N, C, H, W);
      alloc_tensor(&filter, K, C, R, S);
      alloc_tensor(&output, N, K, OH, OW);
    }
  }

  CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU)
  {
    num_devices = MAX_NUM_GPU;
  }

  if (num_devices <= 0)
  {
    printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  if (N > 0)
  {
    // Allocate device memory for each GPU
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) );  // 32 64 256 256
      CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) );   // 64 64 16 16
      CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // CUDA
  if (N > 0)
  {
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaFree(i_d[d]) );
      CUDA_CALL( d, cudaFree(f_d[d]) );
      CUDA_CALL( d, cudaFree(o_d[d]) );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }

  // MPI
  MPI_Barrier(MPI_COMM_WORLD);

  if (N > 0)
  {
    if (mpi_rank != 0)
    {
        free(input);
        free(filter);
        free(output);
    }
  }
}