// chundoong-lab-ta/SamsungDS22/submissions/final/yw0.kim/B/convolution.cu

#include "convolution.h"
#include "util.h"
#include <mpi.h>
#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>

// Values that KERNEL_VERSION can take; they are compared with `#if` to choose
// which kernel gets launched and with which grid/block geometry.
#define NAIVE 0
#define OPTIMIZED 1

// When set to 1, enables the printf blocks guarded by `#if (PRINT_DEBUG == 1)`
// (tensor dumps, per-device batch ranges, launch dimensions).
#define PRINT_DEBUG 0
// #define KERNEL_VERSION NAIVE
#define KERNEL_VERSION OPTIMIZED

// Evaluates the CUDA runtime expression `f` exactly once. If the result is not
// cudaSuccess, prints the source location, numeric code and error string to
// stderr and terminates the process with exit status 1.
//
// The body is wrapped in do { ... } while (0) so that `CUDA_CALL(x);` behaves
// as one statement and remains valid as the body of an unbraced if/else.
#define CUDA_CALL(f)                                                       \
  do                                                                       \
  {                                                                        \
    cudaError_t cuda_call_err_ = (f);                                      \
    if (cuda_call_err_ != cudaSuccess)                                     \
    {                                                                      \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              (int)cuda_call_err_, cudaGetErrorString(cuda_call_err_));    \
      exit(1);                                                             \
    }                                                                      \
  } while (0)

// Capacity of the per-device bookkeeping arrays declared at file scope.
#define MAX_NUM_GPU 4
// Thread count handed to omp_set_num_threads() in convolution_init().
#define NUM_THREADS_PER_NODE 40
// Dimensions of the shared-memory filter tile in kernel_convolution:
// channels x filter rows x filter columns. RTILE_SIZE and STILE_SIZE also
// define the thread-block shape and the output tile covered by one block.
#define CTILE_SIZE 8
#define RTILE_SIZE 16
#define STILE_SIZE 16

// Function-like minimum. NOTE: each argument is evaluated twice, so do not
// pass expressions with side effects.
#define min(A, B) (((A) > (B)) ? (B) : (A))

// Reference convolution kernel.
//
// Layout: blockIdx.x selects the output row and blockIdx.y the output column;
// the thread then loops over every batch sample and output channel for that
// pixel. The host launches it with grid = (OH, OW) and a 1x1 block.
//
// Tensors are dense row-major: input [N][C][H][W], filter [K][C][R][S],
// output [N][K][OH][OW]. Taps that fall outside the input (padding) contribute
// nothing. `mpi_rank` is accepted for interface compatibility and is unused.
__global__ void kernel_convolution_naive(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  const int oh = blockIdx.x;
  const int ow = blockIdx.y;

  // Tolerate a grid that is larger than the output plane.
  if (oh >= OH || ow >= OW)
    return;

  // Top-left input coordinate touched by this output pixel (may be negative).
  const int h_origin = oh * stride - pad;
  const int w_origin = ow * stride - pad;

  // Plane sizes are kept in size_t so that flat offsets cannot overflow a
  // 32-bit int on large tensors.
  const size_t in_plane = (size_t)H * W;
  const size_t filter_plane = (size_t)R * S;
  const size_t out_plane = (size_t)OH * OW;
  const size_t out_pixel = (size_t)oh * OW + ow;

  for (int n = 0; n < N; n++)
  {
    for (int k = 0; k < K; k++)
    {
      float acc = 0.0f;
      for (int c = 0; c < C; c++)
      {
        const float *in_plane_ptr = input + ((size_t)n * C + c) * in_plane;
        const float *filter_plane_ptr = filter + ((size_t)k * C + c) * filter_plane;

        for (int r = 0; r < R; r++)
        {
          const int h = h_origin + r * dilation;
          if (h < 0 || h >= H)
            continue; // whole filter row lies in the padding

          const float *in_row = in_plane_ptr + (size_t)h * W;
          const float *filter_row = filter_plane_ptr + r * S;

          for (int s = 0; s < S; s++)
          {
            const int w = w_origin + s * dilation;
            if (w < 0 || w >= W)
              continue;
            acc += in_row[w] * filter_row[s];
          }
        }
      }

      output[((size_t)n * K + k) * out_plane + out_pixel] = acc;
    }
  }
}

// Tiled convolution kernel: one thread per output element.
//
// Expected launch geometry:
//   block = (STILE_SIZE, RTILE_SIZE)            -> threadIdx.x / threadIdx.y
//   grid  = (ceil(OH / RTILE_SIZE), ceil(OW / STILE_SIZE), N * K)
// blockIdx.z encodes the (sample, output channel) pair as n * K + k.
//
// Static shared memory: CTILE_SIZE * RTILE_SIZE * STILE_SIZE floats holding a
// tile of the filter for output channel k. Each thread stages one (r, s) tap
// per channel of the tile; positions beyond R or S are zero-filled.
//
// Tensors are dense row-major: input [N][C][H][W], filter [K][C][R][S],
// output [N][K][OH][OW]. `mpi_rank` is unused and kept for interface
// compatibility.
__global__ void kernel_convolution(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  const int lr = threadIdx.y;
  const int ls = threadIdx.x;
  const int oh = blockIdx.x * RTILE_SIZE + lr;
  const int ow = blockIdx.y * STILE_SIZE + ls;
  const int n = blockIdx.z / K;
  const int k = blockIdx.z % K;

  // Threads outside the output tensor still help stage the filter and must
  // reach every barrier, but they skip the accumulation and the final store.
  const bool produces_output = (n < N) && (oh < OH) && (ow < OW);

  // Top-left input coordinate touched by this output pixel (may be negative).
  const int h_origin = oh * stride - pad;
  const int w_origin = ow * stride - pad;

  // size_t plane sizes keep flat offsets from overflowing a 32-bit int.
  const size_t in_plane = (size_t)H * W;
  const size_t filter_plane = (size_t)R * S;
  const float *filter_k = filter + (size_t)k * C * filter_plane;

  __shared__ float filter_shared[CTILE_SIZE][RTILE_SIZE][STILE_SIZE];

  float acc = 0.0f;
  for (int rtile = 0; rtile < R; rtile += RTILE_SIZE)
  {
    // Number of real filter rows in this tile; rows past it are zero padding.
    const int rows_in_tile = min(RTILE_SIZE, R - rtile);

    for (int stile = 0; stile < S; stile += STILE_SIZE)
    {
      const int cols_in_tile = min(STILE_SIZE, S - stile);

      // Filter tap this thread is responsible for staging.
      const int r_load = rtile + lr;
      const int s_load = stile + ls;
      const bool tap_exists = (r_load < R) && (s_load < S);

      for (int ctile = 0; ctile < C; ctile += CTILE_SIZE)
      {
        const int chans_in_tile = min(CTILE_SIZE, C - ctile);

        for (int lc = 0; lc < chans_in_tile; lc++)
        {
          if (tap_exists)
          {
            filter_shared[lc][lr][ls] =
                filter_k[(size_t)(ctile + lc) * filter_plane + r_load * S + s_load];
          }
          else
          {
            filter_shared[lc][lr][ls] = 0.0f;
          }
        }
        __syncthreads(); // tile fully written before anyone reads it

        if (produces_output)
        {
          for (int lc = 0; lc < chans_in_tile; lc++)
          {
            const float *in_chan = input + ((size_t)n * C + (ctile + lc)) * in_plane;

            // Only real taps are visited: zero-padded slots would add nothing
            // for finite data, cost a global load each, and would turn an
            // Inf/NaN input outside the receptive field into NaN.
            for (int r = 0; r < rows_in_tile; r++)
            {
              const int h = h_origin + (rtile + r) * dilation;
              if (h < 0 || h >= H)
                continue;

              const float *in_row = in_chan + (size_t)h * W;
              for (int s = 0; s < cols_in_tile; s++)
              {
                const int w = w_origin + (stile + s) * dilation;
                if (w < 0 || w >= W)
                  continue;
                acc += in_row[w] * filter_shared[lc][r][s];
              }
            }
          }
        }
        __syncthreads(); // all reads done before the tile is overwritten
      }
    }
  }

  if (produces_output)
  {
    output[((size_t)n * K + k) * OH * OW + (size_t)oh * OW + ow] = acc;
  }
}

// Number of CUDA devices on this node; filled in by cuda_device_init().
static int num_devices = 0;
// Host tensors. On rank 0 these are set from the arguments of convolution();
// on other ranks they are allocated in convolution_init().
static float *input, *output, *filter;
// Problem dimensions, copied from the arguments of convolution_init().
static int N, C, H, W;
static int K, R, S;
// Output height/width, derived in convolution_init().
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
// Batch range [nstart, nend) and its length handled by this MPI rank.
static int nstart, nend, nlen;

// Shared MPI request/status objects used by the point-to-point calls below.
static MPI_Request request;
static MPI_Status status;

// Per-device buffers, allocated in cuda_device_malloc().
static float *input_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
// Per-device sub-range [Nbegin, Nend) of this rank's batch and its length,
// computed in cuda_device_init().
static int Nbegin[MAX_NUM_GPU], Nend[MAX_NUM_GPU], Nlen[MAX_NUM_GPU];

// Runs one convolution over the whole batch.
//
// The tensor pointers are only consulted on rank 0; the dimension arguments
// are ignored in favour of the values stored by convolution_init().
//
// Single rank: copy to the GPUs, run the kernel, copy back.
// Two ranks: rank 0 ships rank 1's batch slice and the filter, both ranks
// compute their own slice, and rank 0 then collects rank 1's output.
// NOTE: the multi-rank path only exchanges data with rank 1; additional ranks
// are not served by this function.
//
// Every non-blocking send is completed before this function returns, so the
// caller is free to reuse or release its buffers afterwards.
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;

#if (PRINT_DEBUG == 1)
    print_filter(filter, K, C, R, S);
    print_input(input, nlen, C, H, W);
#endif
  }

  if (mpi_world_size == 1)
  {
    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();
    return;
  }

  const int filter_count = K * C * R * S;

  if (mpi_rank == 0)
  {
    // Batch slice owned by the peer rank, kept in locals so the file-scope
    // nstart/nend/nlen keep describing rank 0's own slice.
    const int peer = 1;
    const int peer_start = N / mpi_world_size * peer + min(peer, N % mpi_world_size);
    const int peer_end = N / mpi_world_size * (peer + 1) + min(peer + 1, N % mpi_world_size);
    const int peer_len = peer_end - peer_start;

    nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
    nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
    nlen = nend - nstart;

    // One request per send: both transfers overlap with the local GPU work
    // and are completed explicitly below.
    MPI_Request send_reqs[2];
    MPI_Isend(input + (size_t)peer_start * C * H * W, peer_len * C * H * W,
              MPI_FLOAT, peer, 1, MPI_COMM_WORLD, &send_reqs[0]);
    MPI_Isend(filter, filter_count,
              MPI_FLOAT, peer, 1, MPI_COMM_WORLD, &send_reqs[1]);

    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();

    MPI_Waitall(2, send_reqs, MPI_STATUSES_IGNORE);

    MPI_Recv(output + (size_t)peer_start * K * OH * OW, peer_len * K * OH * OW,
             MPI_FLOAT, peer, 1, MPI_COMM_WORLD, &status);
  }
  else
  {
    const int root = 0;

    // Same tag for both messages: MPI delivers them in the order they were
    // sent, i.e. input first, then filter.
    MPI_Recv(input, nlen * C * H * W, MPI_FLOAT, root, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(filter, filter_count, MPI_FLOAT, root, 1, MPI_COMM_WORLD, &status);
    zero_tensor(output, nlen, K, OH, OW);

    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();

    // Complete the send before returning: `output` is rewritten on the next
    // call and must not change while the transfer is in flight.
    MPI_Isend(output, nlen * K * OH * OW, MPI_FLOAT, root, 1, MPI_COMM_WORLD, &request);
    MPI_Wait(&request, &status);
  }
}

// One-time setup: stores the problem dimensions, derives the output size,
// determines which part of the batch this MPI rank handles, allocates host
// staging tensors on non-root ranks, then initialises the GPUs and their
// buffers.
//
// With more than one rank, rank 0 computes rank 1's batch range and sends it;
// rank 1 receives it. NOTE: only rank 1 is sent a range, so this supports at
// most two ranks.
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  N = _N;
  C = _C;
  H = _H;
  W = _W;
  K = _K;
  R = _R;
  S = _S;

  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  omp_set_num_threads(NUM_THREADS_PER_NODE);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  if (mpi_world_size == 1)
  {
    // Single rank: it owns the whole batch.
    nstart = 0;
    nend = N;
    nlen = nend - nstart;
  }
  else if (mpi_rank == 0)
  {
    // Balanced split: the first (N % size) ranks get one extra sample.
    const int dest = 1;
    int dest_start = N / mpi_world_size * dest + min(dest, N % mpi_world_size);
    int dest_end = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size);

    // Blocking sends from dedicated locals: the buffers are never modified
    // while a transfer is pending and no request is left uncompleted.
    MPI_Send(&dest_start, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
    MPI_Send(&dest_end, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);

    nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
    nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
    nlen = nend - nstart;
  }
  else
  {
    const int source = 0;
    // Receive this rank's batch range from rank 0 (start first, then end).
    MPI_Recv(&nstart, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&nend, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
    nlen = nend - nstart;

    // Non-root ranks have no caller-provided tensors; allocate staging space
    // for the slice they will receive and the output they will produce.
    alloc_tensor(&input, nlen, C, H, W);
    alloc_tensor(&output, nlen, K, OH, OW);
    alloc_tensor(&filter, K, C, R, S);
  }

  cuda_device_init();
  cuda_device_malloc();
}

// Releases the per-device buffers allocated by cuda_device_malloc().
//
// All arguments are unused. Safe to call more than once: freed pointers are
// reset to NULL and cudaFree(NULL) is a no-op.
// NOTE(review): the host tensors allocated with alloc_tensor() on non-root
// ranks are not released here because their matching deallocator is not
// visible in this file — confirm and add if one exists.
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  for (int i = 0; i < num_devices && i < MAX_NUM_GPU; i++)
  {
    CUDA_CALL(cudaSetDevice(i));

    CUDA_CALL(cudaFree(input_d[i]));
    input_d[i] = NULL;

    CUDA_CALL(cudaFree(filter_d[i]));
    filter_d[i] = NULL;

    CUDA_CALL(cudaFree(output_d[i]));
    output_d[i] = NULL;
  }
}

// Queries the number of CUDA devices on this node and splits this rank's
// batch slice (nlen samples) across them.
//
// The device count is capped at MAX_NUM_GPU because every per-device array in
// this file has exactly that many slots. Exits with status 1 when no device
// is available.
void cuda_device_init(void)
{
  CUDA_CALL(cudaGetDeviceCount(&num_devices));

  if (num_devices <= 0)
  {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }

  // Never index past the fixed-size per-device arrays.
  if (num_devices > MAX_NUM_GPU)
  {
    num_devices = MAX_NUM_GPU;
  }

  // Balanced split: the first (nlen % num_devices) devices get one extra
  // sample. A device may end up with zero samples when nlen < num_devices.
  const int per_device = nlen / num_devices;
  const int leftover = nlen % num_devices;

  for (int i = 0; i < num_devices; i++)
  {
    Nbegin[i] = per_device * i + min(i, leftover);
    Nend[i] = per_device * (i + 1) + min(i + 1, leftover);
    Nlen[i] = Nend[i] - Nbegin[i];
#if (PRINT_DEBUG == 1)
    printf("Node #%d Device #%d (Nbegin, Nend, Nlen): (%d, %d, %d)\n", mpi_rank, i, Nbegin[i], Nend[i], Nlen[i]);
#endif
  }
}

// Allocates, on every device, the input slice, a full copy of the filter and
// the output slice for that device's share of the batch.
//
// Sizes are computed in size_t so that large tensors do not overflow a 32-bit
// int before the multiplication by sizeof(float).
void cuda_device_malloc(void)
{
  const size_t filter_bytes = (size_t)K * C * R * S * sizeof(float);

  // One host thread per device; cudaSetDevice() applies to the calling thread.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    const size_t input_bytes = (size_t)Nlen[i] * C * H * W * sizeof(float);
    const size_t output_bytes = (size_t)Nlen[i] * K * OH * OW * sizeof(float);

    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMalloc(&input_d[i], input_bytes));
    CUDA_CALL(cudaMalloc(&filter_d[i], filter_bytes));
    CUDA_CALL(cudaMalloc(&output_d[i], output_bytes));
  }
}

// Uploads each device's input slice and a full copy of the filter from the
// host tensors.
//
// Offsets and byte counts are computed in size_t to avoid 32-bit overflow on
// large tensors.
void cuda_memcpy_host_to_device(void)
{
  const size_t sample_elems = (size_t)C * H * W;
  const size_t filter_bytes = (size_t)K * C * R * S * sizeof(float);

  // cudaMemcpy blocks the calling thread, so one host thread per device lets
  // the uploads proceed concurrently. cudaSetDevice() is per-thread.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(input_d[i],
                         input + (size_t)Nbegin[i] * sample_elems,
                         (size_t)Nlen[i] * sample_elems * sizeof(float),
                         cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(filter_d[i],
                         filter,
                         filter_bytes,
                         cudaMemcpyHostToDevice));
  }
}

// Downloads each device's output slice into its place in the host output
// tensor, then synchronises the device so any pending error is reported here.
//
// Offsets and byte counts are computed in size_t to avoid 32-bit overflow on
// large tensors.
void cuda_memcpy_device_to_host(void)
{
  const size_t sample_elems = (size_t)K * OH * OW;

  // cudaMemcpy blocks the calling thread, so one host thread per device lets
  // the downloads proceed concurrently. cudaSetDevice() is per-thread.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(output + (size_t)Nbegin[i] * sample_elems,
                         output_d[i],
                         (size_t)Nlen[i] * sample_elems * sizeof(float),
                         cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

// Launches the convolution kernel selected by KERNEL_VERSION on every device
// that has work, then waits for all devices to finish.
//
// Kernel launches return immediately, so a plain loop already lets the
// devices run concurrently. Launch-configuration errors (for example a grid
// dimension above the hardware limit) are only visible through
// cudaGetLastError(), hence the explicit check after each launch.
void cuda_kernel_call(void)
{
  for (int i = 0; i < num_devices; i++)
  {
    // A device with no samples would otherwise be launched with an empty
    // grid, which is an invalid configuration.
    if (Nlen[i] <= 0)
    {
      continue;
    }

#if (KERNEL_VERSION == NAIVE)
    // Naive version: one single-thread block per output pixel.
    dim3 block(1, 1);
    dim3 grid(OH, OW);
#elif (KERNEL_VERSION == OPTIMIZED)
    // Optimized version: one thread per output element; grid.z enumerates
    // the (sample, output channel) pairs of this device's slice.
    dim3 block(STILE_SIZE, RTILE_SIZE);
    dim3 grid((OH + RTILE_SIZE - 1) / RTILE_SIZE, (OW + STILE_SIZE - 1) / STILE_SIZE, Nlen[i] * K);
#endif

#if (PRINT_DEBUG == 1)
    printf("Device #%d (blockDim.x, blockDim.y): (%d, %d)\n", i, block.x, block.y);
    printf("Device #%d (gridDim.x, gridDim.y, gridDim.z): (%d, %d, %d)\n", i, grid.x, grid.y, grid.z);
#endif

    CUDA_CALL(cudaSetDevice(i));
#if (KERNEL_VERSION == NAIVE)
    kernel_convolution_naive<<<grid, block>>>(mpi_rank,
                                              input_d[i], output_d[i], filter_d[i],
                                              Nlen[i], C, H, W,
                                              K, R, S, OH, OW,
                                              pad, dilation, stride);
#elif (KERNEL_VERSION == OPTIMIZED)
    kernel_convolution<<<grid, block>>>(mpi_rank,
                                        input_d[i], output_d[i], filter_d[i],
                                        Nlen[i], C, H, W,
                                        K, R, S, OH, OW,
                                        pad, dilation, stride);
#endif
    CUDA_CALL(cudaGetLastError());
  }

  // Wait for every device; this also surfaces faults raised during execution.
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

// Debug helper: writes a [K][C][R][S] filter tensor to stdout, one (k, c)
// plane at a time with one text line per filter row.
void print_filter(float *filter, int K, int C, int R, int S)
{
  printf("--- FILTER (K, C, R, S): (%d, %d, %d, %d) --- \n", K, C, R, S);

  for (int k = 0; k < K; ++k)
  {
    for (int c = 0; c < C; ++c)
    {
      // Start of the R x S plane belonging to (k, c).
      const float *plane = filter + k * C * R * S + c * R * S;

      printf("(k, c): (%d, %d)\n", k, c);
      for (int r = 0; r < R; ++r)
      {
        const float *row = plane + r * S;
        int s = 0;
        while (s < S)
        {
          printf("%f ", row[s]);
          ++s;
        }
        printf("\n");
      }
      printf("\n\n");
    }
  }
}

// Debug helper: writes an [N][C][H][W] input tensor to stdout, one (n, c)
// plane at a time with one text line per image row.
void print_input(float *input, int N, int C, int H, int W)
{
  // The header reports this tensor's own dimensions (N, C, H, W).
  printf("--- INPUT (N, C, H, W): (%d, %d, %d, %d) --- \n", N, C, H, W);
  for (int n = 0; n < N; n++)
  {
    for (int c = 0; c < C; c++)
    {
      printf("(n, c): (%d, %d)\n", n, c);
      for (int h = 0; h < H; h++)
      {
        for (int w = 0; w < W; w++)
        {
          printf("%f ", input[n * C * H * W + c * H * W + h * W + w]);
        }
        printf("\n");
      }
      printf("\n\n");
    }
  }
}