// chundoong-lab-ta/SamsungDS22/submissions/final/g.kwak/tmp-B/convolution.cu

#include "convolution.h"
#include <mpi.h>
#include <cstdio>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call and terminates the process with a diagnostic
// (file, line, numeric code, message) if it did not return cudaSuccess.
//
// The body is wrapped in do { } while (0) so that `CUDA_CALL(x);` expands to
// exactly one statement and stays correct inside an unbraced if/else.  The
// local is named err_ so that an argument expression mentioning a variable
// called `err` is not shadowed by the macro's own temporary.
#define CUDA_CALL(f)                                                           \
  do {                                                                         \
    cudaError_t err_ = (f);                                                    \
    if (err_ != cudaSuccess) {                                                 \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,     \
              static_cast<int>(err_), cudaGetErrorString(err_));               \
      exit(1);                                                                 \
    }                                                                          \
  } while (0)


// Edge length of a thread block: kernels are launched with TS x TS threads.
#define TS 8
// Capacity of the per-GPU bookkeeping arrays declared below.
#define MAX_NUM_GPU 4
// Number of GPUs that convolution() iterates over; set by convolution_init().
int num_devices = 0;
// Per-GPU device buffers, indexed by device ordinal:
//   a_d[i] receives this GPU's slice of the input,
//   b_d[i] receives a full copy of the filter,
//   c_d[i] holds this GPU's slice of the output.
static float *a_d[MAX_NUM_GPU];
static float *b_d[MAX_NUM_GPU];
static float *c_d[MAX_NUM_GPU];
// Msize[i]  = number of batch items assigned to GPU i.
// Mbegin[i] = index of GPU i's first batch item within this rank's portion.
static int Msize[MAX_NUM_GPU], Mbegin[MAX_NUM_GPU];
// mpi_size[r] = number of batch items handled by MPI rank r (ranks 0 and 1 only).
static int mpi_size[2];

// Host tensors for the current call.  convolution() sets them from its
// arguments; on non-master ranks they are replaced by locally allocated
// buffers when the batch is split across ranks.
static float *input, *output, *filter;
// Problem dimensions recorded by convolution_init().  The kernel indexes the
// input as [n][c][h][w], the filter as [k][c][r][s] and the output as
// [n][k][oh][ow].
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
// This process's rank and the communicator size in MPI_COMM_WORLD.
static int mpi_rank, mpi_world_size;

// Element counts computed in convolution_init():
//   CHW = C*H*W (one input image), KCRS = K*C*R*S (whole filter),
//   KOHOW = K*OH*OW (one output image).
static int CHW   ;
static int KCRS  ;
static int KOHOW ;


// NOTE(review): alloc_tensor() used by convolution() is presumably declared here — confirm.
#include "util.h"
// Rank that owns the full input/output tensors.
#define MASTER 0
// Message tag used for the master -> worker input/filter transfers.
#define FROM_MASTER 1
// Not referenced by the code visible in this file.
#define FROM_WORKER 2


// Direct 2-D convolution kernel.  Despite its name it does not perform a
// matrix multiply: each thread computes one output element by accumulating
// input * filter over all (c, r, s).
//
// Expected launch layout:
//   x dimension covers the output rows            (at least OH threads),
//   y dimension covers the flattened (n, k, ow)   (at least N*K*OW threads).
// Threads outside that range exit without writing.  No shared memory is used.
//
// Indexing (all tensors are dense):
//   input  [n][c][h][w]    with extents N x C x H x W
//   filter [k][c][r][s]    with extents K x C x R x S
//   output [n][k][oh][ow]  with extents N x K x OH x OW
// Input taps that fall outside the image (because of padding) are skipped,
// i.e. they contribute zero.
//
// Element offsets are computed in size_t so that tensors with more than
// 2^31 elements do not overflow 32-bit arithmetic.
__global__ void sgemm
(   float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S,
    int pad, int dilation, int stride) {

  const int OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  const int OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  const int row_global = blockDim.x * blockIdx.x + threadIdx.x;
  const int col_global = blockDim.y * blockIdx.y + threadIdx.y;

  const int KOW = K * OW;

  // Reject out-of-range threads before doing any index division.
  if (row_global >= OH || col_global >= N * KOW) return;

  // Decompose the flattened y index into (n, k, col).
  const int n   = col_global / KOW;
  const int rem = col_global - n * KOW;
  const int k   = rem / OW;
  const int col = rem - k * OW;
  const int row = row_global;

  // Top-left input coordinate touched by this output element.
  const int row_init = row * stride - pad;
  const int col_init = col * stride - pad;

  const size_t HW = (size_t)H * W;
  const size_t RS = (size_t)R * S;
  const float *in_n   = input  + (size_t)n * C * HW;
  const float *filt_k = filter + (size_t)k * C * RS;

  float o = 0.f;
  for (int c = 0; c < C; ++c) {
    const float *in_c   = in_n   + c * HW;
    const float *filt_c = filt_k + c * RS;
    for (int r = 0; r < R; ++r) {
      // h depends only on r, so test it once per filter row.
      const int h = row_init + r * dilation;
      if (h < 0 || h >= H) continue;
      const float *in_row   = in_c   + (size_t)h * W;
      const float *filt_row = filt_c + (size_t)r * S;
      for (int s = 0; s < S; ++s) {
        const int w = col_init + s * dilation;
        if (w < 0 || w >= W) continue;
        o += in_row[w] * filt_row[s];
      }
    }
  }

  output[((size_t)n * K + k) * OH * OW + (size_t)row * OW + col] = o;
}


// Runs one convolution over the batch configured by convolution_init().
//
// Data flow:
//   1. If the batch is split across MPI ranks (mpi_size[1] != 0), the master
//      sends the trailing mpi_size[1] images and the whole filter to the
//      workers, which receive them into locally owned host buffers.
//   2. Every rank uploads its images to its GPUs (Msize[i] images starting at
//      local image Mbegin[i] go to device i), launches the kernel on each GPU
//      and downloads the results.
//   3. In the split case rank 1 returns its outputs to the master, which
//      stores them after its own mpi_size[0] output images.
//
// Parameters:
//   _input, _output, _filter  host tensors; used directly on the master.  On
//                             workers they are replaced by local buffers when
//                             the batch is split.
//   _C.._stride               forwarded to the kernel.  Buffer sizes and the
//                             launch grid come from the values recorded by
//                             convolution_init(), so both must describe the
//                             same problem.
//   _N                        unused; the batch split comes from init.
//
// All nonblocking sends are completed before the function returns, so the
// caller may reuse or release its buffers immediately afterwards.
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride )
{
  input  = _input;
  output = _output;
  filter = _filter;

  const bool distributed = (mpi_size[1] != 0);
  const bool is_master   = (mpi_rank == MASTER);
  const int  numworkers  = mpi_world_size - 1;

  // Requests of the master's nonblocking sends; completed at the end so the
  // transfers can overlap with the master's own GPU work.
  MPI_Request *send_reqs = NULL;
  int num_send_reqs = 0;

  // ---- 1. Scatter input slice and filter to the workers --------------------
  if (distributed) {
    if (is_master) {
      if (numworkers > 0) send_reqs = new MPI_Request[2 * numworkers];
      for (int dest = 1; dest <= numworkers; dest++) {
        MPI_Isend(&input[(size_t)mpi_size[0] * CHW], mpi_size[1] * CHW, MPI_FLOAT,
                  dest, FROM_MASTER, MPI_COMM_WORLD, &send_reqs[num_send_reqs++]);
        MPI_Isend(&filter[0], KCRS, MPI_FLOAT,
                  dest, FROM_MASTER, MPI_COMM_WORLD, &send_reqs[num_send_reqs++]);
      }
    }
    else {
      // Worker-side host buffers are allocated once and reused across calls
      // instead of being re-allocated (and leaked) on every invocation.  They
      // are re-allocated only if the required element count changes; the old
      // buffer is then abandoned because the matching deallocator for
      // alloc_tensor() is not visible here.
      // NOTE(review): assumes alloc_tensor(&p, a, b, c, d) yields a*b*c*d floats — confirm.
      static float *w_input = NULL, *w_output = NULL, *w_filter = NULL;
      static size_t w_in_elems = 0, w_out_elems = 0, w_filt_elems = 0;

      const size_t in_elems   = (size_t)mpi_size[1] * CHW;
      const size_t out_elems  = (size_t)mpi_size[1] * KOHOW;
      const size_t filt_elems = (size_t)KCRS;

      if (w_input == NULL || w_in_elems != in_elems) {
        alloc_tensor(&w_input, mpi_size[1], C, H, W);
        w_in_elems = in_elems;
      }
      if (w_output == NULL || w_out_elems != out_elems) {
        alloc_tensor(&w_output, mpi_size[1], K, OH, OW);
        w_out_elems = out_elems;
      }
      if (w_filter == NULL || w_filt_elems != filt_elems) {
        alloc_tensor(&w_filter, K, C, R, S);
        w_filt_elems = filt_elems;
      }
      input  = w_input;
      output = w_output;
      filter = w_filter;

      // Same tag for both messages: MPI's non-overtaking rule keeps them in
      // the order the master posted them (input first, then filter).
      MPI_Recv(&input[0],  mpi_size[1] * CHW, MPI_FLOAT, MASTER, FROM_MASTER,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      MPI_Recv(&filter[0], KCRS,              MPI_FLOAT, MASTER, FROM_MASTER,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }

  // Never index the fixed-size per-GPU arrays past their capacity.
  const int ndev = (num_devices < MAX_NUM_GPU) ? num_devices : MAX_NUM_GPU;

  // ---- 2a. Upload input slice and filter to every GPU that has work --------
  for (int i = 0; i < ndev; i++) {
    if (Msize[i] <= 0) continue;   // idle device: nothing to copy or compute
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(a_d[i], input + (size_t)Mbegin[i] * CHW,
                          (size_t)Msize[i] * CHW * sizeof(float),
                          cudaMemcpyHostToDevice) );
    CUDA_CALL( cudaMemcpy(b_d[i], filter,
                          (size_t)KCRS * sizeof(float),
                          cudaMemcpyHostToDevice) );
  }

  /*
  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaDeviceSynchronize() );
  }
  */

  // ---- 2b. Launch the kernel on every GPU -----------------------------------
  // Launches are asynchronous, so all GPUs compute concurrently.  The grid's
  // y extent is limited to 65535 blocks, therefore a device's images are
  // processed in chunks small enough to respect that limit.
  const int kMaxGridY      = 65535;
  const int cols_per_image = K * OW;   // y-threads needed per image
  const int grid_x         = (OH + TS - 1) / TS;

  if (OH > 0 && cols_per_image > 0) {
    int imgs_per_launch = (kMaxGridY * TS) / cols_per_image;
    // A single image that is too wide still gets launched; the failure is
    // then reported by the launch check below rather than passing silently.
    if (imgs_per_launch < 1) imgs_per_launch = 1;

    const dim3 block(TS, TS, 1);
    for (int i = 0; i < ndev; i++) {
      if (Msize[i] <= 0) continue;
      CUDA_CALL( cudaSetDevice(i) );
      for (int first = 0; first < Msize[i]; first += imgs_per_launch) {
        const int remaining = Msize[i] - first;
        const int nb = (remaining < imgs_per_launch) ? remaining : imgs_per_launch;
        const dim3 grid(grid_x, (nb * cols_per_image + TS - 1) / TS, 1);

        sgemm<<<grid, block>>>(a_d[i] + (size_t)first * CHW,
                               c_d[i] + (size_t)first * KOHOW,
                               b_d[i], nb,
                               _C, _H, _W, _K, _R, _S, _pad, _dilation, _stride);
        // Kernel launches do not return an error code; fetch it explicitly.
        CUDA_CALL( cudaGetLastError() );
      }
    }
  }

  /*
  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaDeviceSynchronize() );
  }
  */

  // ---- 2c. Download the output slices ---------------------------------------
  // The blocking copy also waits for the kernels queued on that device.
  for (int i = 0; i < ndev; i++) {
    if (Msize[i] <= 0) continue;
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(output + (size_t)Mbegin[i] * KOHOW, c_d[i],
                          (size_t)Msize[i] * KOHOW * sizeof(float),
                          cudaMemcpyDeviceToHost) );
  }

  /*
  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaDeviceSynchronize() );
  }
  */

  // ---- 3. Gather the worker's results on the master -------------------------
  // The batch is only split when exactly two ranks exist, so the master
  // receives from rank 1 alone.
  if (distributed) {
    if (is_master) {
      MPI_Recv(&output[(size_t)mpi_size[0] * KOHOW], mpi_size[1] * KOHOW, MPI_FLOAT,
               1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    else {
      // Blocking send: the buffer is guaranteed reusable once this returns.
      MPI_Send(&output[0], mpi_size[1] * KOHOW, MPI_FLOAT,
               MASTER, 0, MPI_COMM_WORLD);
    }
  }

  // Complete the master's outstanding sends before handing control back.
  if (num_send_reqs > 0) {
    MPI_Waitall(num_send_reqs, send_reqs, MPI_STATUSES_IGNORE);
  }
  delete[] send_reqs;
}


// Records the problem dimensions, decides how the batch is divided between
// MPI ranks and between this rank's GPUs, and allocates the device buffers.
//
// Batch split between ranks:
//   With exactly two ranks and N > 4, rank 1 takes N/2 images and rank 0 the
//   rest.  Otherwise rank 0 processes the whole batch and every other rank
//   has nothing to do.
//
// Batch split between GPUs:
//   The number of GPUs used is the smallest of: devices present, MAX_NUM_GPU,
//   and the number of local images.  Local images are distributed as evenly
//   as possible; the first (count % gpus) GPUs get one extra image.
//   num_devices is set to the number of GPUs actually used, so every device
//   the caller iterates over has at least one image and a valid buffer.
//
// Terminates the process if no CUDA device is available or an allocation
// fails.
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  CHW   = C * H * W;
  KCRS  = K * C * R * S;
  KOHOW = K * OH * OW;

  // ---- Split the batch between MPI ranks ------------------------------------
  mpi_size[1] = (mpi_world_size == 2 && N > 4) ? N / 2 : 0;
  mpi_size[0] = N - mpi_size[1];

  // mpi_size only describes ranks 0 and 1; any further rank owns no images.
  int local_n = (mpi_rank < 2) ? mpi_size[mpi_rank] : 0;
  if (local_n < 0) local_n = 0;

  // ---- Determine how many GPUs to use ---------------------------------------
  int device_count = 0;
  CUDA_CALL( cudaGetDeviceCount(&device_count) );
  if (device_count <= 0) {
    fprintf(stderr, "No CUDA device found. Aborting\n");
    exit(1);
  }

  int used = (device_count < MAX_NUM_GPU) ? device_count : MAX_NUM_GPU;
  if (local_n < used) used = local_n;   // never more GPUs than images
  num_devices = used;

  // ---- Split the local images between the GPUs ------------------------------
  // Clear every slot first so no stale assignment survives a re-initialisation.
  for (int i = 0; i < MAX_NUM_GPU; i++) {
    Msize[i]  = 0;
    Mbegin[i] = 0;
  }

  if (num_devices > 0) {
    const int base  = local_n / num_devices;
    const int extra = local_n % num_devices;
    int offset = 0;
    for (int i = 0; i < num_devices; i++) {
      Msize[i]  = base + (i < extra ? 1 : 0);
      Mbegin[i] = offset;
      offset   += Msize[i];
    }
  }

  // ---- Allocate device memory on every GPU in use ---------------------------
  // Sizes are computed in size_t to avoid 32-bit overflow for large slices.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMalloc(&a_d[i], (size_t)Msize[i] * CHW   * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&b_d[i], (size_t)KCRS             * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&c_d[i], (size_t)Msize[i] * KOHOW * sizeof(float)) );
  }
}


// Releases the device buffers allocated by convolution_init().
//
// The parameters are unused; they are kept so the signature matches the other
// convolution_* entry points.  Pointers are reset after being freed, which
// makes a repeated call harmless (freeing a null device pointer is a no-op).
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,

    int _pad, int _dilation, int _stride)
{
  // Bound the loop by the array capacity as well as the device count.
  for (int i = 0; i < num_devices && i < MAX_NUM_GPU; i++) {
    CUDA_CALL( cudaSetDevice(i) );

    CUDA_CALL( cudaFree(a_d[i]) );
    a_d[i] = NULL;

    CUDA_CALL( cudaFree(b_d[i]) );
    b_d[i] = NULL;

    CUDA_CALL( cudaFree(c_d[i]) );
    c_d[i] = NULL;
  }
}