// chundoong-lab-ta/SamsungDS22/submissions/final/c.m.lee/B/convolution.cu
#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "util.h"
#define CUDA_CALL(f)                                                       \
  {                                                                        \
    cudaError_t err = (f);                                                 \
    if (err != cudaSuccess) {                                              \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              err, cudaGetErrorString(err));                               \
      exit(1);                                                             \
    }                                                                      \
  }
// definitions
#define MAX_NODE_COUNT 2
#define MAX_GPU_COUNT 4
#define SGEMM_BLOCKSIZE 16
#define IM2COL_NTHREADS 1024
// per-node (MPI) variables
static int num_devices;
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad, dilation, stride;
static int mpi_rank, mpi_world_size;
static int wbegin[MAX_NODE_COUNT], wend[MAX_NODE_COUNT];
static int wsize[MAX_NODE_COUNT];
static int mbegin[MAX_NODE_COUNT][MAX_GPU_COUNT];
static int mend[MAX_NODE_COUNT][MAX_GPU_COUNT];
static int msize[MAX_NODE_COUNT][MAX_GPU_COUNT];
static int rounded_M, rounded_N;
// GPU devices' variables
static float *h_input[MAX_GPU_COUNT];
static float *h_output[MAX_GPU_COUNT];
static float *d_input[MAX_GPU_COUNT];
static float *d_filter[MAX_GPU_COUNT];
static float *d_col[MAX_GPU_COUNT];
static float *d_output[MAX_GPU_COUNT];
static cudaStream_t stream[MAX_GPU_COUNT];
// kernel forward declarations
__global__ void im2col_gpu_kernel(const int n, const float *data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad, const int stride, const int dilation, const int height_col,
    const int width_col, float *data_col);
__global__ void sgemm_gpu_kernel(float *A, float *B, float *C,
    int M, int N, int K);
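// The convolution is computed as im2col followed by one SGEMM per sample:
// the filter tensor is viewed as a K x (C*R*S) matrix, im2col unrolls each
// input sample into a (C*R*S) x (OH*OW) column matrix, and their product is
// the K x (OH*OW) output feature map. The GEMM's M (= K output channels) and
// N (= OH*OW) dimensions are rounded up to multiples of SGEMM_BLOCKSIZE so
// the kernel grid divides evenly; the padded rows/columns are zero-filled at
// init time and skipped when results are copied back to the host.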
// static function: run im2col + SGEMM for every sample assigned to this node,
// distributing samples across the local GPUs (one stream per GPU)
static void convolution_gpu(void) {
  int n;
  int im2col_kernels = C * OH * OW;
  int im2col_blocks = (im2col_kernels + IM2COL_NTHREADS - 1) / IM2COL_NTHREADS;
  dim3 sgemm_blockDim(SGEMM_BLOCKSIZE, SGEMM_BLOCKSIZE);
  dim3 sgemm_gridDim(rounded_N / SGEMM_BLOCKSIZE, rounded_M / SGEMM_BLOCKSIZE);

  // copy the (broadcast) filter to every device
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaMemcpy(d_filter[i], filter, K * C * R * S * sizeof(float),
                          cudaMemcpyHostToDevice) );
  }

  // every device processes its n-th assigned sample in lockstep
  for (n = 0; n < msize[mpi_rank][0]; n++) {
    for (int i = 0; i < num_devices; i++) {
      if (n >= msize[mpi_rank][i]) {
        continue;
      }
      CUDA_CALL( cudaSetDevice(i) );
      CUDA_CALL( cudaMemcpyAsync(d_input[i],
                                 h_input[i] + n * C * H * W,
                                 C * H * W * sizeof(float),
                                 cudaMemcpyHostToDevice, stream[i]) );
      im2col_gpu_kernel<<<im2col_blocks, IM2COL_NTHREADS, 0, stream[i]>>>(
          im2col_kernels,
          d_input[i],
          H, W, R, S,
          pad, stride, dilation, OH, OW, d_col[i]);
      sgemm_gpu_kernel<<<sgemm_gridDim, sgemm_blockDim, 0, stream[i]>>>(
          d_filter[i], d_col[i],
          d_output[i],
          K, OH * OW, R * S * C);
      CUDA_CALL( cudaMemcpyAsync(h_output[i] + n * K * OH * OW,
                                 d_output[i],
                                 K * OH * OW * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream[i]) );
    }
  }

  // the last device owns the remainder samples; finish them here
  int d = num_devices - 1;
  for (; n < msize[mpi_rank][d]; n++) {
    CUDA_CALL( cudaSetDevice(d) );
    CUDA_CALL( cudaMemcpyAsync(d_input[d],
                               h_input[d] + n * C * H * W,
                               C * H * W * sizeof(float),
                               cudaMemcpyHostToDevice, stream[d]) );
    im2col_gpu_kernel<<<im2col_blocks, IM2COL_NTHREADS, 0, stream[d]>>>(
        im2col_kernels,
        d_input[d],
        H, W, R, S,
        pad, stride, dilation, OH, OW, d_col[d]);
    sgemm_gpu_kernel<<<sgemm_gridDim, sgemm_blockDim, 0, stream[d]>>>(
        d_filter[d], d_col[d],
        d_output[d],
        K, OH * OW, R * S * C);
    CUDA_CALL( cudaMemcpyAsync(h_output[d] + n * K * OH * OW,
                               d_output[d],
                               K * OH * OW * sizeof(float),
                               cudaMemcpyDeviceToHost, stream[d]) );
  }

  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;
  MPI_Request req[MAX_NODE_COUNT][MAX_GPU_COUNT];

  // non-root ranks receive the filter into their own buffer
  if (mpi_rank != 0) {
    alloc_tensor(&filter, K, C, R, S);
  }

  // scattering: rank 0 sends each remote device's share of the input batch;
  // its own devices are filled with memcpy into the pinned host buffers
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      for (int j = 0; j < num_devices; j++) {
        MPI_Isend(input + (wbegin[i] + mbegin[i][j]) * C * H * W,
                  msize[i][j] * C * H * W,
                  MPI_FLOAT, i, 0, MPI_COMM_WORLD, &req[i - 1][j]);
      }
    }
    for (int i = 0; i < num_devices; i++) {
      memcpy(h_input[i],
             input + (wbegin[0] + mbegin[0][i]) * C * H * W,
             msize[0][i] * C * H * W * sizeof(float));
    }
    if (mpi_world_size > 1) {
      MPI_Waitall((mpi_world_size - 1) * num_devices,
                  &req[0][0],
                  MPI_STATUSES_IGNORE);
    }
  } else {
    for (int i = 0; i < num_devices; i++) {
      // count is in elements, not bytes, since the datatype is MPI_FLOAT
      MPI_Irecv(h_input[i],
                msize[mpi_rank][i] * C * H * W,
                MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &req[0][i]);
    }
    if (wsize[mpi_rank] > 0) {
      MPI_Waitall(num_devices, &req[0][0], MPI_STATUSES_IGNORE);
    }
  }

  // broadcasting the filter to every rank
  if (mpi_world_size > 1) {
    MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
  }

  // computation
  if (wsize[mpi_rank] > 0) {
    convolution_gpu();
  }

  // gathering: rank 0 collects every remote device's output slice and copies
  // its own devices' results out of the pinned host buffers
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      for (int j = 0; j < num_devices; j++) {
        MPI_Irecv(output + (wbegin[i] + mbegin[i][j]) * K * OH * OW,
                  msize[i][j] * K * OH * OW,
                  MPI_FLOAT, i, 1, MPI_COMM_WORLD, &req[i - 1][j]);
      }
    }
    for (int i = 0; i < num_devices; i++) {
      memcpy(output + (wbegin[0] + mbegin[0][i]) * K * OH * OW,
             h_output[i],
             msize[0][i] * K * OH * OW * sizeof(float));
    }
    MPI_Waitall((mpi_world_size - 1) * num_devices,
                &req[0][0], MPI_STATUSES_IGNORE);
  } else {
    for (int i = 0; i < num_devices; i++) {
      MPI_Isend(h_output[i],
                msize[mpi_rank][i] * K * OH * OW,
                MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &req[0][i]);
    }
    MPI_Waitall(num_devices, &req[0][0], MPI_STATUSES_IGNORE);
  }
}
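// The benchmark harness (not part of this file) is expected to drive the
// three entry points roughly as sketched below; the exact call site and any
// timing/validation code around it are assumptions, not part of this source:
//
//   MPI_Init(&argc, &argv);
//   convolution_init(N, C, H, W, K, R, S, pad, dilation, stride);
//   convolution(input, output, filter, N, C, H, W, K, R, S, pad, dilation, stride);
//   convolution_final(N, C, H, W, K, R, S, pad, dilation, stride);
//   MPI_Finalize();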
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;
  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
  // round the GEMM dimensions (M = K, N = OH * OW) up to the tile size
  rounded_M = (K + SGEMM_BLOCKSIZE - 1) / SGEMM_BLOCKSIZE * SGEMM_BLOCKSIZE;
  rounded_N = ((OH * OW) + SGEMM_BLOCKSIZE - 1) / SGEMM_BLOCKSIZE * SGEMM_BLOCKSIZE;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
  if (mpi_rank == 0) {
    printf("Using %d compute nodes\n", mpi_world_size);
  }

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_GPU_COUNT) {
    num_devices = MAX_GPU_COUNT; // cap at the size of the static per-device arrays
  }
  printf("Node %d: Using %d devices\n", mpi_rank, num_devices);
  if (num_devices <= 0) {
    printf("No CUDA device. Aborting\n");
    exit(1);
  }

  // work distribution to nodes: split the batch N evenly, remainder to the last node
  for (int i = 0; i < mpi_world_size; i++) {
    wbegin[i] = N / mpi_world_size * i;
    wend[i] = N / mpi_world_size * (i + 1);
  }
  wend[mpi_world_size - 1] = N;
  for (int i = 0; i < mpi_world_size; i++) {
    wsize[i] = wend[i] - wbegin[i];
  }

  // work distribution to devices: split each node's share evenly,
  // remainder to the last device
  for (int i = 0; i < mpi_world_size; i++) {
    for (int j = 0; j < num_devices; j++) {
      mbegin[i][j] = (wsize[i] / num_devices) * j;
      mend[i][j] = (wsize[i] / num_devices) * (j + 1);
    }
    mend[i][num_devices - 1] = wsize[i];
  }
  for (int i = 0; i < mpi_world_size; i++) {
    for (int j = 0; j < num_devices; j++) {
      msize[i][j] = mend[i][j] - mbegin[i][j];
    }
  }

  // memory allocation: pinned host staging buffers plus per-device
  // input, filter, im2col, and (padded) output buffers
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaStreamCreate(&stream[i]) );
    CUDA_CALL( cudaMallocHost(&h_input[i], msize[mpi_rank][i] * C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMallocHost(&h_output[i], msize[mpi_rank][i] * K * OH * OW * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_input[i], C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_filter[i], rounded_M * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_output[i], rounded_M * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_col[i], R * S * C * OH * OW * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_filter[i], 0, rounded_M * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_output[i], 0, rounded_M * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_col[i], 0, R * S * C * OH * OW * sizeof(float)) );
  }
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}
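// Example of the distribution above (illustrative numbers, not from the
// benchmark): with N = 10 samples, 2 nodes, and 4 GPUs per node, node 0 gets
// samples [0, 5) and node 1 gets [5, 10); within each node, devices 0-2 get
// one sample each and the last device gets the two-sample remainder.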
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaFree(d_input[i]) );
    CUDA_CALL( cudaFree(d_filter[i]) );
    CUDA_CALL( cudaFree(d_output[i]) );
    CUDA_CALL( cudaFree(d_col[i]) );
    CUDA_CALL( cudaFreeHost(h_input[i]) );
    CUDA_CALL( cudaFreeHost(h_output[i]) );
    CUDA_CALL( cudaStreamDestroy(stream[i]) );
  }
}
// im2col: each thread handles one (channel, output-row, output-column) cell
// of the C x OH x OW grid and copies its R x S receptive field into the
// corresponding column of data_col; padded (out-of-image) locations are
// written as zeros.
__global__ void im2col_gpu_kernel(const int n, const float *data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad, const int stride, const int dilation, const int height_col,
    const int width_col, float *data_col) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < n;
       index += blockDim.x * gridDim.x) {
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    const int c_col = c_im * kernel_h * kernel_w;
    const int h_offset = h_col * stride - pad;
    const int w_offset = w_col * stride - pad;
    float *data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float *data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation;
        int w_im = w_offset + j * dilation;
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * dilation * width + j * dilation] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// Tiled SGEMM: C (M x N) = A (M x K) * B (K x N), one SGEMM_BLOCKSIZE x
// SGEMM_BLOCKSIZE output tile per thread block, staged through shared memory.
// The grid covers rounded_M x rounded_N, so edge tiles are zero-padded on
// load and the store is guarded against columns beyond N.
__global__ void sgemm_gpu_kernel(float *A, float *B, float *C,
    int M, int N, int K) {
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int aBegin = K * SGEMM_BLOCKSIZE * by;
  int aEnd = aBegin + K - 1;
  int aStep = SGEMM_BLOCKSIZE;
  int bBegin = SGEMM_BLOCKSIZE * bx;
  int bStep = SGEMM_BLOCKSIZE * N;
  float Csub = 0;
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    __shared__ float As[SGEMM_BLOCKSIZE][SGEMM_BLOCKSIZE];
    __shared__ float Bs[SGEMM_BLOCKSIZE][SGEMM_BLOCKSIZE];
    // Load one tile of A and B, padding with zeros when K (= C*R*S here) or
    // N is not a multiple of SGEMM_BLOCKSIZE, so edge tiles never read past
    // the end of a row or past the end of the B buffer.
    As[ty][tx] = ((a - aBegin) + tx < K) ? A[a + K * ty + tx] : 0.0f;
    Bs[ty][tx] = ((a - aBegin) + ty < K && SGEMM_BLOCKSIZE * bx + tx < N)
                     ? B[b + N * ty + tx] : 0.0f;
    __syncthreads();
#pragma unroll
    for (int k = 0; k < SGEMM_BLOCKSIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }
    __syncthreads();
  }
  // Only the first N columns of each output row are valid; the grid covers
  // rounded_N columns.
  if (bx * SGEMM_BLOCKSIZE + tx < N) {
    int c = N * SGEMM_BLOCKSIZE * by + SGEMM_BLOCKSIZE * bx;
    C[c + N * ty + tx] = Csub;
  }
}
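// For reference, the direct form of the convolution computed by the
// im2col + SGEMM path above (useful when checking against a CPU version):
//
//   output[n][k][oh][ow] =
//       sum over c, r, s of
//           filter[k][c][r][s] *
//           input[n][c][oh * stride - pad + r * dilation]
//                      [ow * stride - pad + s * dilation]
//
// where out-of-range input coordinates contribute zero.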