#include "convolution.h"

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#include "util.h"

// NVIDIA GeForce RTX 3090
// - Compute capability: 8.6
// - Maximum number of threads per block: 1024
// - Maximum amount of shared memory per SM: 100 KB
// - Maximum amount of shared memory per thread block: 99 KB
// - Maximum amount of local memory per thread: 512 KB
// - Maximum x-dimension of a grid of thread blocks: 2^31 - 1
// - Maximum y- or z-dimension of a grid of thread blocks: 65535
// - In an SM:
//   - 128 FP32 cores
//   - 64 INT32 cores for integer math
//   - 4 warp schedulers

#define MAX_NUM_GPU 4
#define MAX_THREAD_PER_BLOCK 8  // block edge length; each block is 8 x 8 = 64 threads
#define MAX_NUM_STREAM 4

#define CEIL_DIV(x,y) ( ((x) + (y) - 1) / (y) )
#define CEIL(x,y) ( CEIL_DIV((x),(y)) * (y) )
#define MIN(a,b) ( ((a) < (b)) ? (a) : (b) )

#define CUDA_LOG(fmt, ...)      //printf(fmt, ##__VA_ARGS__)
#define CUDA_KERN_LOG(fmt, ...) //printf(fmt, ##__VA_ARGS__)

#define CUDA_CALL(d, f) \
  { \
    cudaError_t err = (f); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "[Node %d][GPU %d] CUDA error at [%s:%d] %d %s\n", \
              mpi_rank, (d), __FILE__, __LINE__, err, cudaGetErrorString(err)); \
      exit(1); \
    } \
  }

static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int num_devices;

static int *startN;
static int *sizeN;

// Array of device (GPU) pointers
static float *i_d[MAX_NUM_GPU];
static float *f_d[MAX_NUM_GPU];
static float *o_d[MAX_NUM_GPU];

cudaStream_t streams[MAX_NUM_GPU][MAX_NUM_STREAM];
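
// conv_core: each thread computes one output position (globalRow, globalCol)
// and loops over the batch (N) and output channels (K) it was given,
// accumulating the C x R x S dot product; taps that fall outside the padded
// input are skipped. Threads outside OH x OW (from grid rounding) do no work.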
__global__ void conv_core(float *in, float *flt, float *out,
    int N, int K, int C, int H, int W, int R, int S, int OH, int OW,
    int stride, int pad, int dilation)
{
  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int globalCol = blockDim.x * blockIdx.x + col;
  const int globalRow = blockDim.y * blockIdx.y + row;

  if (globalCol < OW && globalRow < OH)
  {
    for (int n = 0; n < N; ++n)
    {
      for (int k = 0; k < K; ++k)
      {
        float *outForK = &out[n * K * OH * OW + k * OH * OW];

        float o = 0.0f;
        for (int c = 0; c < C; ++c)
        {
          float *inForC = &in[n * C * H * W + c * H * W];
          float *fltForC = &flt[k * C * R * S + c * R * S];

          for (int r = 0; r < R; r++)
          {
            for (int s = 0; s < S; s++)
            {
              int h = globalRow * stride - pad + r * dilation;
              int w = globalCol * stride - pad + s * dilation;
              if (h < 0 || h >= H || w < 0 || w >= W) continue;

              float i = inForC[h * W + w];
              float f = fltForC[r * S + s];

              o += i * f;
              // printf("GR:%03d, GC:%03d, OH:%03d, OW:%03d, C:%03d, R:%03d, S:%03d, H:%03d, W:%03d, I:%f, F:%f, O:%f\n",
              //        globalRow, globalCol, oh, ow, c, r, s, h, w, i, f, o);
            }
          }
        }

        outForK[globalRow * OW + globalCol] = o;

        CUDA_KERN_LOG("N:%03d, K:%03d, GR:%03d, GC:%03d, o:%p, o:%f\n",
                      n, k, globalRow, globalCol,
                      &outForK[globalRow * OW + globalCol], outForK[globalRow * OW + globalCol]);
      }
    }
  }
}
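
// cuda_alloc / cuda_free: page-locked (pinned) host allocations, so the
// cudaMemcpyAsync calls below can overlap with kernel execution on the streams.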
void cuda_alloc(float **t, int D0, int D1, int D2, int D3)
{
  CUDA_CALL( 0, cudaHostAlloc((void **) t, sizeof(float) * D0 * D1 * D2 * D3, cudaHostAllocDefault) );

  if (*t == NULL)
  {
    printf("Failed to allocate memory for matrix.\n");
    exit(1);
  }
}

void cuda_free(float *t)
{
  CUDA_CALL( 0, cudaFreeHost(t) );
}
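
// convolution: distributed convolution driver.
//   1. Scatter: rank 0 broadcasts the filter and sends each rank its slice of
//      the input batch (rank 0 keeps the caller-provided buffers).
//   2. Compute: the K output channels are split across the local GPUs; each
//      sample is copied to the GPUs, convolved, and copied back on a
//      round-robin stream so transfers and kernels can overlap.
//   3. Gather: rank 0 receives every other rank's output slice.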
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  if (mpi_rank == 0)
  {
    input = _input;
    output = _output;
    filter = _filter;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // scatter
  MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);

  if (mpi_rank == 0)
  {
    for (int i = 1; i < mpi_world_size; i++)
    {
      const int sizeOfN = sizeN[i] * C * H * W;
      float *inputForNodes = input + (startN[i] * C * H * W);

      if (sizeOfN == 0)
      {
        continue;
      }

      MPI_Send(inputForNodes, sizeOfN, MPI_FLOAT, i, 1, MPI_COMM_WORLD);
    }
  }
  else
  {
    if (N > 0)
    {
      MPI_Recv(input, N * C * H * W, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // calc
  int slicedK = K / num_devices;
  int startK[MAX_NUM_GPU];
  int sizeK[MAX_NUM_GPU];
  for (int d = 0; d < num_devices; d++)
  {
    startK[d] = slicedK * d;
    sizeK[d] = slicedK;
  }
  sizeK[num_devices - 1] = K - startK[num_devices - 1];
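  // e.g., with K = 64 and 4 GPUs, slicedK = 16 and GPU d covers output
  // channels [16*d, 16*d + 16); any remainder of K / num_devices goes to the
  // last GPU.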

  for (int d = 0; d < num_devices; d++)
  {
    int streamID = 0;

    if (N <= 0 || sizeK[d] <= 0)
    {
      continue;
    }

    CUDA_CALL( d, cudaSetDevice(d) );

    // Upload filter data to every GPU
    CUDA_LOG("[Node %d][GPU %d] CopyToDev: f_d=%p, filter=%p, size=%lu(0x%lX)\n",
             mpi_rank, d, f_d[d], &filter[startK[d] * C * R * S],
             sizeK[d] * C * R * S * sizeof(float), sizeK[d] * C * R * S * sizeof(float));

    CUDA_CALL( d, cudaMemcpyAsync(f_d[d], &filter[startK[d] * C * R * S],
                                  sizeK[d] * C * R * S * sizeof(float),
                                  cudaMemcpyHostToDevice, streams[d][streamID]) );
  }
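
  // Stream the batch one sample at a time: sample streamN uses stream
  // streamN % MAX_NUM_STREAM on each GPU, so host-to-device copies, kernel
  // launches, and device-to-host copies of different samples can overlap.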
  for (int streamN = 0; streamN < N; streamN++)
  {
    int streamID = streamN % MAX_NUM_STREAM;

    for (int d = 0; d < num_devices; d++)
    {
      if (N <= 0 || sizeK[d] <= 0)
      {
        continue;
      }

      CUDA_CALL( d, cudaSetDevice(d) );

      // Upload input data to every GPU
      CUDA_LOG("[Node %d][GPU %d] CopyToDev: i_d=%p, input=%p, size=%lu(0x%lX)\n",
               mpi_rank, d,
               i_d[d] + streamN * C * H * W, input + streamN * C * H * W,
               C * H * W * sizeof(float), C * H * W * sizeof(float));

      CUDA_CALL( d, cudaMemcpyAsync(i_d[d] + streamN * C * H * W, input + streamN * C * H * W,
                                    C * H * W * sizeof(float),
                                    cudaMemcpyHostToDevice, streams[d][streamID]) );
    }

    for (int d = 0; d < num_devices; d++)
    {
      if (N <= 0 || sizeK[d] <= 0)
      {
        continue;
      }

      CUDA_CALL( d, cudaSetDevice(d) );

      // Launch kernel on every GPU
      dim3 blockDim(MAX_THREAD_PER_BLOCK, MAX_THREAD_PER_BLOCK, 1);
      dim3 gridDim(CEIL_DIV(OW, MAX_THREAD_PER_BLOCK), CEIL_DIV(OH, MAX_THREAD_PER_BLOCK), 1);

      CUDA_LOG("[Node %d][GPU %d] KickKernel: i_d=%p, f_d=%p, o_d=%p"
               ", N=%d, sizeK[d]=%d, C=%d, H=%d, W=%d, R=%d, S=%d, OH=%d, OW=%d"
               ", stride=%d, pad=%d, dilation=%d\n",
               mpi_rank, d,
               i_d[d] + streamN * C * H * W, f_d[d], o_d[d] + streamN * sizeK[d] * OH * OW,
               1, sizeK[d], C, H, W, R, S, OH, OW,
               stride, pad, dilation);

      conv_core<<<gridDim, blockDim, 0, streams[d][streamID]>>>(
          i_d[d] + streamN * C * H * W, f_d[d], o_d[d] + streamN * sizeK[d] * OH * OW,
          1, sizeK[d], C, H, W, R, S, OH, OW,
          stride, pad, dilation);
    }

    // Download output data from GPUs
    for (int d = 0; d < num_devices; d++)
    {
      if (N <= 0 || sizeK[d] <= 0)
      {
        continue;
      }

      CUDA_CALL( d, cudaSetDevice(d) );

      // for (int n = 0; n < N; n++)
      {
        CUDA_LOG("[Node %d][GPU %d] CopyFromDev output=%p, o_d=%p, size=%lu(0x%lX)\n",
                 mpi_rank, d,
                 output + (streamN * K * OH * OW) + (startK[d] * OH * OW), o_d[d] + streamN * sizeK[d] * OH * OW,
                 sizeK[d] * OH * OW * sizeof(float), sizeK[d] * OH * OW * sizeof(float));

        CUDA_CALL( d, cudaMemcpyAsync(output + (streamN * K * OH * OW) + (startK[d] * OH * OW),
                                      o_d[d] + streamN * sizeK[d] * OH * OW,
                                      sizeK[d] * OH * OW * sizeof(float),
                                      cudaMemcpyDeviceToHost, streams[d][streamID]) );
      }
    }
  }

  for (int d = 0; d < num_devices; d++)
  {
    CUDA_CALL( d, cudaSetDevice(d) );

    CUDA_CALL( d, cudaDeviceSynchronize() );
  }

  ////////////////////////////////////////////////////////////////////////////////
  // gather

  if (mpi_rank == 0)
  {
    for (int i = 1; i < mpi_world_size; i++)
    {
      const int sizeOfN = sizeN[i] * K * OH * OW;
      float *outputForNodes = output + (startN[i] * K * OH * OW);

      if (sizeOfN == 0)
      {
        continue;
      }

      MPI_Recv(outputForNodes, sizeOfN, MPI_FLOAT, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }
  else
  {
    if (N > 0)
    {
      MPI_Send(output, N * K * OH * OW, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
    }
  }
}
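
// convolution_init: computes the output shape, partitions the batch across
// MPI ranks, allocates pinned host buffers on non-root ranks and device
// buffers on every visible GPU, and creates the per-GPU streams.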
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  // startN/sizeN are int arrays; the float tensor allocator is reused here
  // on the assumption that sizeof(int) == sizeof(float).
  alloc_tensor((float**)&startN, mpi_world_size, 1, 1, 1);
  alloc_tensor((float**)&sizeN, mpi_world_size, 1, 1, 1);

  int slicedN = (_N <= mpi_world_size) ? 1 : ((_N * 3) / (mpi_world_size * 4));
  int offsetN = 0;
  for (int i = 0; i < mpi_world_size; i++)
  {
    startN[i] = offsetN;
    sizeN[i] = MIN(slicedN, _N - startN[i]);

    offsetN += sizeN[i];
  }
  sizeN[mpi_world_size - 1] = _N - startN[mpi_world_size - 1];
  N = sizeN[mpi_rank];
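
  // Illustrative numbers: with _N = 32 and mpi_world_size = 2, slicedN is
  // (32 * 3) / (2 * 4) = 12, so rank 0 gets 12 samples while the last rank
  // takes the remaining 20; the 3/4 factor under-assigns the earlier ranks
  // and leaves the tail of the batch to the last rank.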

  // Rank 0 keeps using the caller-provided host buffers; the other ranks
  // allocate their own pinned buffers to receive their slice of the work.
  if (mpi_rank != 0)
  {
    if (N > 0)
    {
      cuda_alloc(&input, N, C, H, W);
      cuda_alloc(&output, N, K, OH, OW);
    }

    cuda_alloc(&filter, K, C, R, S);
  }

  CUDA_LOG("[Node %d] N = %d, _N = %d, slicedN = %d, startN=%d, sizeN=%d\n",
           mpi_rank, N, _N, slicedN, startN[mpi_rank], sizeN[mpi_rank]);

  CUDA_CALL( 0, cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU)
  {
    num_devices = MAX_NUM_GPU;
  }

  if (num_devices <= 0)
  {
    printf("[Node %d] No CUDA device found. Aborting\n", mpi_rank);
    exit(1);
  }

  if (N > 0)
  {
    // Allocate device memory for each GPU
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaMalloc(&i_d[d], sizeof(float) * CEIL(N * C * H * W, 8)) );   // 32 64 256 256
      CUDA_CALL( d, cudaMalloc(&f_d[d], sizeof(float) * CEIL(K * C * R * S, 8)) );   // 64 64 16 16
      CUDA_CALL( d, cudaMalloc(&o_d[d], sizeof(float) * CEIL(N * K * OH * OW, 8)) ); // 32 64 256 256
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      for (int s = 0; s < MAX_NUM_STREAM; s++)
      {
        CUDA_CALL( d, cudaStreamCreate(&streams[d][s]) );
      }

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }
}
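
// convolution_final: releases the per-GPU device buffers and streams, then
// frees the pinned host buffers owned by non-root ranks and the partition
// arrays.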
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride)
{
  // CUDA
  if (N > 0)
  {
    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      CUDA_CALL( d, cudaFree(i_d[d]) );
      CUDA_CALL( d, cudaFree(f_d[d]) );
      CUDA_CALL( d, cudaFree(o_d[d]) );
    }

    for (int d = 0; d < num_devices; d++)
    {
      CUDA_CALL( d, cudaSetDevice(d) );

      for (int s = 0; s < MAX_NUM_STREAM; s++)
      {
        CUDA_CALL( d, cudaStreamDestroy(streams[d][s]) );
      }

      CUDA_CALL( d, cudaDeviceSynchronize() );
    }
  }

  // MPI
  MPI_Barrier(MPI_COMM_WORLD);

  // Rank 0 never allocated its own host buffers, so only the other ranks
  // free them here.
  if (mpi_rank != 0)
  {
    if (N > 0)
    {
      cuda_free(input);
      cuda_free(output);
    }

    cuda_free(filter);
  }

  free(startN);
  free(sizeN);
}