#include "matmul.h"
#include "util.h"

#include <cuda_runtime.h>
#include <mpi.h>

#define CUDA_CALL(f)                                                           \
  {                                                                            \
    cudaError_t err = (f);                                                     \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,     \
              err, cudaGetErrorString(err));                                   \
      exit(1);                                                                 \
    }                                                                          \
  }

#define MAX_NUM_GPU 4
int num_devices = 0;

static int mpi_rank, mpi_world_size;
static int Asendcounts[4];
static int Adispls[4];
static int Crecvcounts[4];
static int Cdispls[4];

// Array of device (GPU) pointers
static float *a_d[MAX_NUM_GPU];
static float *b_d[MAX_NUM_GPU];
static float *c_d[MAX_NUM_GPU];
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];

cudaStream_t streams[MAX_NUM_GPU];

#define BLOCK_SIZE 32
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
__global__ void matmul_kernel(float *A, float *B, float *C, int M, int N, int K) {
  int j = blockIdx.x * blockDim.x + threadIdx.x;
  int i = blockIdx.y * blockDim.y + threadIdx.y;

  int gj = blockIdx.x;
  int gi = blockIdx.y;

  if (gi * BLOCK_SIZE >= M || gj * BLOCK_SIZE >= N) return; // boundary check 

  int lj = threadIdx.x;
  int li = threadIdx.y;

  __shared__ float Alocal[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ float Blocal[BLOCK_SIZE][BLOCK_SIZE];

  float c = 0.f;

  int A_row_index = (gi * BLOCK_SIZE + li);
  int B_col_index = (gj * BLOCK_SIZE + lj);

  for (int bk = 0; bk < K; bk += BLOCK_SIZE) {
    int A_col_index = bk + lj;
    Alocal[li][lj] = (A_row_index < M && A_col_index < K) ?
      A[A_row_index * K + A_col_index] :
      0.f;

    int B_row_index = bk + li;
    Blocal[li][lj] = (B_row_index < K && B_col_index < N) ?
      B[B_row_index * N + B_col_index] : 
      0.f;

    __syncthreads();

    for (int lk = 0; lk < BLOCK_SIZE; ++lk) {
        c  += Alocal[li][lk] * Blocal[lk][lj];
    }
    __syncthreads();
  }

  if (i < M && j < N)
    C[i * N + j] = c;
}

void matmul(const float *A, const float *B, float *C, int M, int N, int K) {

  MPI_Scatterv(A, Asendcounts, Adispls,
      MPI_FLOAT, (void*)A, Asendcounts[mpi_rank], MPI_FLOAT, 
      0, MPI_COMM_WORLD);
  MPI_Bcast((void*)B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD);

  // Upload A and B matrix to every GPU
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));

    CUDA_CALL(cudaMemcpyAsync(
          a_d[i], A + Mbegin[i] * K,
          (Mend[i] - Mbegin[i]) * K * sizeof(float),
          cudaMemcpyHostToDevice,
          streams[i]));
    CUDA_CALL(cudaMemcpyAsync(b_d[i], B, K * N * sizeof(float),
          cudaMemcpyHostToDevice, streams[i]));

    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (Mend[i] - Mbegin[i] + BLOCK_SIZE-1) / BLOCK_SIZE);

    matmul_kernel<<<gridDim, blockDim, 0, streams[i]>>>(a_d[i], b_d[i], c_d[i], Mend[i] - Mbegin[i], N, K);

    CUDA_CALL(cudaMemcpyAsync(C + Mbegin[i] * N, c_d[i],
          (Mend[i] - Mbegin[i]) * N * sizeof(float),
          cudaMemcpyDeviceToHost, streams[i]));
  }

#pragma omp parallel for
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaStreamSynchronize(streams[i]));
  }

  MPI_Gatherv(C, Crecvcounts[mpi_rank], MPI_FLOAT,
      C, Crecvcounts, Cdispls, MPI_FLOAT,
      0, MPI_COMM_WORLD);
}

void matmul_initialize(int M, int N, int K) {
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  for (int i = 0; i < mpi_world_size; i++) {
    Adispls[i] = ((M / mpi_world_size) * K) * i;
    Asendcounts[i] = ((M / mpi_world_size) * K);
    Cdispls[i] = ((M / mpi_world_size) * N) * i;
    Crecvcounts[i] = ((M / mpi_world_size) * N);
  }
  Asendcounts[mpi_world_size - 1] = M*K - Adispls[mpi_world_size-1];
  Crecvcounts[mpi_world_size - 1] = M*N - Cdispls[mpi_world_size-1];

  // Only root process do something
  CUDA_CALL(cudaGetDeviceCount(&num_devices));

  int num_global_devices = 0;
  MPI_Reduce(&num_devices, (void*)&num_global_devices, 1, MPI_INT,
      MPI_SUM, 0, MPI_COMM_WORLD);

  if (mpi_rank == 0) {
    printf("Using %d devices\n", num_devices);
  }
  MPI_Barrier(MPI_COMM_WORLD);

  for (int j = 0; j < mpi_world_size; ++j) {
    if (mpi_rank == j) {
      for (int i = 0; i < num_devices; i++) {
        cudaDeviceProp prop;
        CUDA_CALL(cudaGetDeviceProperties(&prop, i));

        // Try printing more detailed information here
        printf("[rank %d] GPU %d: %s\n", mpi_rank, i, prop.name);
      }
    }
    MPI_Barrier(MPI_COMM_WORLD);
  }

  if (num_devices <= 0) {
    printf("[rank %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  // Setup problem size for each GPU
  for (int i = 0; i < num_devices; i++) {
    Mbegin[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * i;
    Mend[i] = ((Asendcounts[mpi_rank] / K) / num_devices) * (i + 1);
  }
  Mend[num_devices - 1] = (Asendcounts[mpi_rank] / K);

  // Allocate device memory for each GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMalloc(&a_d[i], (Mend[i] - Mbegin[i]) * K * sizeof(float)));
    CUDA_CALL(cudaMalloc(&b_d[i], K * N * sizeof(float)));
    CUDA_CALL(cudaMalloc(&c_d[i], (Mend[i] - Mbegin[i]) * N * sizeof(float)));

    CUDA_CALL(cudaStreamCreate(&streams[i]));
  }
}

void matmul_finalize() {
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL(cudaFree(a_d[i]));
    CUDA_CALL(cudaFree(b_d[i]));
    CUDA_CALL(cudaFree(c_d[i]));

    CUDA_CALL(cudaStreamDestroy(streams[i]));
  }
}