#include "convolution.h"
#include "util.h"
#include <mpi.h>
#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>

#define NAIVE 0
#define OPTIMIZED 1

#define PRINT_DEBUG 0
// #define KERNEL_VERSION NAIVE
#define KERNEL_VERSION OPTIMIZED

// Abort with a readable file/line message whenever a CUDA runtime call fails.
#define CUDA_CALL(f)                                                          \
  {                                                                           \
    cudaError_t err = (f);                                                    \
    if (err != cudaSuccess)                                                   \
    {                                                                         \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,    \
              err, cudaGetErrorString(err));                                  \
      exit(1);                                                                \
    }                                                                         \
  }
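
// Usage example for CUDA_CALL (illustrative, not from the original source):
// wrap any CUDA runtime call that returns cudaError_t, e.g.
//   float *buf;
//   CUDA_CALL(cudaMalloc(&buf, 1024 * sizeof(float)));
// On failure this prints the failing file/line and error string, then exits.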

#define MAX_NUM_GPU 4
#define NUM_THREADS_PER_NODE 40
#define CTILE_SIZE 8
#define RTILE_SIZE 16
#define STILE_SIZE 16

#define min(A, B) (((A) > (B)) ? (B) : (A))

// Naive kernel: one single-thread block per output pixel (oh, ow); that
// thread loops over every batch sample n, output channel k, and filter
// tap (c, r, s).
__global__ void kernel_convolution_naive(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  int oh = blockIdx.x;
  int ow = blockIdx.y;

  for (int n = 0; n < N; n++)
  {
    for (int k = 0; k < K; k++)
    {
      float o = 0.0f;
      for (int c = 0; c < C; c++)
      {
        for (int r = 0; r < R; r++)
        {
          for (int s = 0; s < S; s++)
          {
            int h = oh * stride - pad + r * dilation;
            int w = ow * stride - pad + s * dilation;
            if (h < 0 || h >= H || w < 0 || w >= W)
              continue;
            float i = input[n * C * H * W + c * H * W + h * W + w];
            float f = filter[k * C * R * S + c * R * S + r * S + s];
            o += i * f;
          }
        }
      }

      output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
    }
  }
}

// Tiled kernel: each thread block computes an RTILE_SIZE x STILE_SIZE tile of
// output pixels for one (n, k) pair (packed into blockIdx.z). The filter is
// staged through shared memory in CTILE_SIZE-channel slices so every thread
// in the block reuses the same filter taps.
__global__ void kernel_convolution(
    int mpi_rank,
    float *input, float *output, float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  int lr = threadIdx.y;
  int ls = threadIdx.x;
  int oh = blockIdx.x * RTILE_SIZE + lr;
  int ow = blockIdx.y * STILE_SIZE + ls;
  int n = blockIdx.z / K;
  int k = blockIdx.z % K;

  __shared__ float filter_shared[CTILE_SIZE][RTILE_SIZE][STILE_SIZE];

  float o = 0.0f;
  for (int rtile = 0; rtile < R; rtile += RTILE_SIZE)
  {
    for (int stile = 0; stile < S; stile += STILE_SIZE)
    {
      for (int ctile = 0; ctile < C; ctile += CTILE_SIZE)
      {
        // Stage this (rtile, stile, ctile) slice of the filter; taps that
        // fall outside R x S are zero-filled so the accumulation loop below
        // needs no extra bounds checks on r and s.
        int r = rtile + lr;
        int s = stile + ls;
        int climit = min(ctile + CTILE_SIZE, C);
        for (int c = ctile; c < climit; c++)
        {
          int lc = c % CTILE_SIZE;
          if (r < R && s < S)
          {
            filter_shared[lc][lr][ls] = filter[k * C * R * S + c * R * S + r * S + s];
          }
          else
          {
            filter_shared[lc][lr][ls] = 0.0f;
          }
        }
        __syncthreads();

        for (int c = ctile; c < climit; c++)
        {
          for (int r = 0; r < RTILE_SIZE; r++)
          {
            for (int s = 0; s < STILE_SIZE; s++)
            {
              int lc = c % CTILE_SIZE;
              int h = oh * stride - pad + (rtile + r) * dilation;
              int w = ow * stride - pad + (stile + s) * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W)
                continue;
              float i = input[n * C * H * W + c * H * W + h * W + w];
              float f = filter_shared[lc][r][s];
              o += i * f;
            }
          }
        }
        __syncthreads();
      }
    }
  }

  // Threads past the right/bottom edge of the output still participated in
  // the shared-memory loads above; only in-range threads write a result
  // (oh and ow are nonnegative by construction).
  if (oh < OH && ow < OW)
  {
    output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
  }
}
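
// Illustrative shared-memory sizing for the tile constants above (my
// arithmetic, not from the original source): filter_shared holds
// CTILE_SIZE * RTILE_SIZE * STILE_SIZE = 8 * 16 * 16 = 2048 floats,
// i.e. 8 KiB per block, well under the 48 KiB static shared-memory limit
// of current CUDA devices.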

static int num_devices = 0;
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int nstart, nend, nlen;

static MPI_Request request;
static MPI_Status status;

static float *input_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
static int Nbegin[MAX_NUM_GPU], Nend[MAX_NUM_GPU], Nlen[MAX_NUM_GPU];

void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;

#if (PRINT_DEBUG == 1)
    print_filter(filter, K, C, R, S);
    print_input(input, nlen, C, H, W);
#endif
  }

  if (mpi_world_size == 1)
  {
    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();
  }
  else
  {
    if (mpi_rank == 0)
    {
      // Block-distribute the batch dimension: compute node #1's slice, ship
      // it the input slice and the full filter, then recompute this rank's
      // own slice.
      int dest = 1;
      nstart = N / mpi_world_size * dest + min(dest, N % mpi_world_size);
      nend = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size);
      nlen = nend - nstart;
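      // Worked example of this block distribution (illustrative numbers):
      // with N = 7 and mpi_world_size = 2, N / 2 = 3 and N % 2 = 1, so
      // rank 0 gets [0, 4) (4 samples) and rank 1 gets [4, 7) (3 samples);
      // the remainder is spread over the leading ranks.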

      // The same request handle is reused without an MPI_Wait, which leaks
      // the request objects; the transfers still complete because the
      // matching receives drive them, and neither buffer is modified again.
      MPI_Isend(&input[nstart * C * H * W], nlen * C * H * W, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
      MPI_Isend(filter, K * C * R * S, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);

      nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
      nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
      nlen = nend - nstart;
    }
    else
    {
      int source = 0;
      MPI_Recv(input, nlen * C * H * W, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(filter, K * C * R * S, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
      zero_tensor(output, nlen, K, OH, OW);
    }

    cuda_memcpy_host_to_device();
    cuda_kernel_call();
    cuda_memcpy_device_to_host();

    if (mpi_rank == 0)
    {
      // Gather node #1's slice of the output back into the full tensor.
      int source = 1;
      nstart = N / mpi_world_size * source + min(source, N % mpi_world_size);
      nend = N / mpi_world_size * (source + 1) + min(source + 1, N % mpi_world_size);
      nlen = nend - nstart;
      MPI_Recv(&output[nstart * K * OH * OW], nlen * K * OH * OW, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
    }
    else
    {
      int dest = 0;
      MPI_Isend(output, nlen * K * OH * OW, MPI_FLOAT, dest, 1, MPI_COMM_WORLD, &request);
    }
  }
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  N = _N;
  C = _C;
  H = _H;
  W = _W;
  K = _K;
  R = _R;
  S = _S;

  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  omp_set_num_threads(NUM_THREADS_PER_NODE);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // Standard convolution output size with padding, dilation, and stride.
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
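
  // Quick sanity check of the formula (illustrative numbers): H = W = 64,
  // R = S = 3, pad = 1, dilation = 1, stride = 1 gives
  // OH = (64 + 2 - 2 - 1) / 1 + 1 = 64, i.e. a "same"-size output.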

  if (mpi_world_size == 1)
  {
    // Only one node: rank 0 owns the whole batch.
    nstart = 0;
    nend = N;
    nlen = nend - nstart;
  }
  else
  {
    // This path assumes exactly two nodes (rank 0 and rank 1).
    if (mpi_rank == 0)
    {
      int dest = 1;
      nstart = N / mpi_world_size * dest + min(dest, N % mpi_world_size);
      nend = N / mpi_world_size * (dest + 1) + min(dest + 1, N % mpi_world_size);

      // Send node #1 its start and end index. These two ints are overwritten
      // just below, so blocking sends are required here (an MPI_Isend could
      // still be reading the buffers after they change).
      MPI_Send(&nstart, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
      MPI_Send(&nend, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);

      nstart = N / mpi_world_size * mpi_rank + min(mpi_rank, N % mpi_world_size);
      nend = N / mpi_world_size * (mpi_rank + 1) + min(mpi_rank + 1, N % mpi_world_size);
      nlen = nend - nstart;
    }
    else
    {
      int source = 0;
      // Receive start and end index from node #0.
      MPI_Recv(&nstart, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      MPI_Recv(&nend, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
      nlen = nend - nstart;

      // Non-root nodes own no caller buffers, so allocate local staging
      // tensors for their slice of the problem.
      alloc_tensor(&input, nlen, C, H, W);
      alloc_tensor(&output, nlen, K, OH, OW);
      alloc_tensor(&filter, K, C, R, S);
    }
  }

  cuda_device_init();
  cuda_device_malloc();
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // Nothing to tear down: host and device buffers are left allocated and
  // reclaimed at process exit.
}

void cuda_device_init(void)
{
  CUDA_CALL(cudaGetDeviceCount(&num_devices));

  if (num_devices <= 0)
  {
    fprintf(stderr, "No CUDA device found. Aborting\n");
    exit(1);
  }

  // Block-distribute this node's batch slice (nlen samples) across the GPUs.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    Nbegin[i] = (nlen / num_devices) * i + min(i, nlen % num_devices);
    Nend[i] = (nlen / num_devices) * (i + 1) + min(i + 1, nlen % num_devices);
    Nlen[i] = Nend[i] - Nbegin[i];
#if (PRINT_DEBUG == 1)
    printf("Node #%d Device #%d (Nbegin, Nend, Nlen): (%d, %d, %d)\n", mpi_rank, i, Nbegin[i], Nend[i], Nlen[i]);
#endif
  }
}

void cuda_device_malloc(void)
{
  // One OpenMP thread per GPU; cudaSetDevice is per-thread, so each thread
  // can allocate on its own device concurrently.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMalloc(&input_d[i], Nlen[i] * C * H * W * sizeof(float)));
    CUDA_CALL(cudaMalloc(&filter_d[i], K * C * R * S * sizeof(float)));
    CUDA_CALL(cudaMalloc(&output_d[i], Nlen[i] * K * OH * OW * sizeof(float)));
  }
}

void cuda_memcpy_host_to_device(void)
{
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(input_d[i],
                         input + Nbegin[i] * C * H * W,
                         Nlen[i] * C * H * W * sizeof(float),
                         cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(filter_d[i],
                         filter,
                         K * C * R * S * sizeof(float),
                         cudaMemcpyHostToDevice));
  }
}

void cuda_memcpy_device_to_host(void)
{
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaMemcpy(output + Nbegin[i] * K * OH * OW,
                         output_d[i],
                         Nlen[i] * K * OH * OW * sizeof(float),
                         cudaMemcpyDeviceToHost));
  }

  // cudaMemcpy is already synchronous, but an explicit sync per device makes
  // the completion point unmistakable.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

void cuda_kernel_call(void)
{
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
#if (KERNEL_VERSION == NAIVE)
    // Naive version: one single-thread block per output pixel.
    dim3 blockDim(1, 1);
    dim3 gridDim(OH, OW);
#elif (KERNEL_VERSION == OPTIMIZED)
    // Optimized version: 16x16 thread tiles over the output plane, with one
    // grid slice in z per (n, k) pair; grid extents are rounded up so partial
    // tiles at the edges are still covered.
    dim3 blockDim(STILE_SIZE, RTILE_SIZE);
    dim3 gridDim((OH + RTILE_SIZE - 1) / RTILE_SIZE, (OW + STILE_SIZE - 1) / STILE_SIZE, Nlen[i] * K);
#endif
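
    // Example geometry (illustrative numbers, not from the original source):
    // OH = OW = 30 with RTILE_SIZE = STILE_SIZE = 16 rounds up to a 2 x 2
    // grid of tiles, so with Nlen[i] = 4 and K = 8 the grid is (2, 2, 32)
    // blocks of 16 x 16 = 256 threads each.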

#if (PRINT_DEBUG == 1)
    printf("Device #%d (blockDim.x, blockDim.y): (%d, %d)\n", i, blockDim.x, blockDim.y);
    printf("Device #%d (gridDim.x, gridDim.y, gridDim.z): (%d, %d, %d)\n", i, gridDim.x, gridDim.y, gridDim.z);
#endif

    CUDA_CALL(cudaSetDevice(i));
#if (KERNEL_VERSION == NAIVE)
    kernel_convolution_naive<<<gridDim, blockDim>>>(mpi_rank,
                                                    input_d[i], output_d[i], filter_d[i],
                                                    Nlen[i], C, H, W,
                                                    K, R, S, OH, OW,
                                                    pad, dilation, stride);
#elif (KERNEL_VERSION == OPTIMIZED)
    kernel_convolution<<<gridDim, blockDim>>>(mpi_rank,
                                              input_d[i], output_d[i], filter_d[i],
                                              Nlen[i], C, H, W,
                                              K, R, S, OH, OW,
                                              pad, dilation, stride);
#endif
  }

  // Wait for every device to finish before the host touches the results.
#pragma omp parallel for
  for (int i = 0; i < num_devices; i++)
  {
    CUDA_CALL(cudaSetDevice(i));
    CUDA_CALL(cudaDeviceSynchronize());
  }
}

void print_filter(float *filter, int K, int C, int R, int S)
{
  printf("--- FILTER (K, C, R, S): (%d, %d, %d, %d) ---\n", K, C, R, S);
  for (int k = 0; k < K; k++)
  {
    for (int c = 0; c < C; c++)
    {
      printf("(k, c): (%d, %d)\n", k, c);
      for (int r = 0; r < R; r++)
      {
        for (int s = 0; s < S; s++)
        {
          printf("%f ", filter[k * C * R * S + c * R * S + r * S + s]);
        }
        printf("\n");
      }
      printf("\n\n");
    }
  }
}

void print_input(float *input, int N, int C, int H, int W)
{
  printf("--- INPUT (N, C, H, W): (%d, %d, %d, %d) ---\n", N, C, H, W);
  for (int n = 0; n < N; n++)
  {
    for (int c = 0; c < C; c++)
    {
      printf("(n, c): (%d, %d)\n", n, c);
      for (int h = 0; h < H; h++)
      {
        for (int w = 0; w < W; w++)
        {
          printf("%f ", input[n * C * H * W + c * H * W + h * W + w]);
        }
        printf("\n");
      }
      printf("\n\n");
    }
  }
}
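
// --- Illustrative extra (not part of the original file) ---------------------
// A minimal single-threaded CPU reference, useful for spot-checking kernel
// output on small shapes. It mirrors the NCHW / KCRS indexing used above; the
// function name and the idea of a host-side check are assumptions, not
// something the original source provides.
void convolution_cpu_reference(
    const float *input, float *output, const float *filter,
    int N, int C, int H, int W,
    int K, int R, int S, int OH, int OW,
    int pad, int dilation, int stride)
{
  for (int n = 0; n < N; n++)
    for (int k = 0; k < K; k++)
      for (int oh = 0; oh < OH; oh++)
        for (int ow = 0; ow < OW; ow++)
        {
          float o = 0.0f;
          for (int c = 0; c < C; c++)
            for (int r = 0; r < R; r++)
              for (int s = 0; s < S; s++)
              {
                int h = oh * stride - pad + r * dilation;
                int w = ow * stride - pad + s * dilation;
                if (h < 0 || h >= H || w < 0 || w >= W)
                  continue;
                o += input[n * C * H * W + c * H * W + h * W + w] *
                     filter[k * C * R * S + c * R * S + r * S + s];
              }
          output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
        }
}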