chundoong-lab-ta/SamsungDS22/submissions/final/seong81.kim/B/convolution.cu

#include "convolution.h"
#include <mpi.h>
#include <stdio.h>
#include "util.h"
#include <immintrin.h>
#include <cstdio>
#include <cuda_runtime.h>
#define TS 32
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad;
static int dilation;
static int stride;
static int mpi_rank, mpi_world_size;
static int size[2];
#define CUDA_CALL(f) \
{ \
cudaError_t err = (f); \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
err, cudaGetErrorString(err)); \
exit(1); \
} \
}
#define MAX_NUM_GPU 4
int num_devices = 0;
static float *input__d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
static int N_str [MAX_NUM_GPU];
static int N_siz [MAX_NUM_GPU];
void cuda_init() ;
void cuda_final() ;
// Convolution Reference
// for (int n = 0; n < N; ++n) {
// for (int k = 0; k < K; ++k) {
// for (int oh = 0; oh < OH; ++oh) {
// for (int ow = 0; ow < OW; ++ow) {
// float o = 0.f;
// for (int c = 0; c < C; ++c) {
// for (int r = 0; r < R; ++r) {
// for (int s = 0; s < S; ++s) {
// int h = oh * stride - pad + r * dilation;
// int w = ow * stride - pad + s * dilation;
// if (h < 0 || h >= H || w < 0 || w >= W) continue;
// float i = input [n * C * H * W + c * H * W + h * W + w];
// float f = filter[k * C * R * S + c * R * S + r * S + s];
// o += i * f;
// } // s
// } // r
// } // c
// output[n * K * OH * OW + k * OH * OW + oh * OW + ow] = o;
// } // ow
// } // oh
// } // k
// } // n
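//
// Thread mapping used by all kernels below: each thread computes one output
// column ow = blockIdx.x * blockDim.x + threadIdx.x and one output row
// oh = blockIdx.y * blockDim.y + threadIdx.y, while blockIdx.z packs the
// (batch, output-channel) pair as z = n * K + k, so n = blockIdx.z / K and
// k = blockIdx.z % K. kernel4 additionally lets one thread produce 4 adjacent
// ow values.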
// CUDA Kernel 1 -- Naive Loop version
__global__ void kernel1(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int pad, int dilation, int stride, int OH, int OW) {
const int ow_thread = threadIdx.x ;
const int oh_thread = threadIdx.y ;
const int ow_block = blockIdx.x ;
const int oh_block = blockIdx.y ;
const int ow = (ow_block * blockDim.x) + ow_thread ;
const int oh = (oh_block * blockDim.y) + oh_thread ;
const int k_ptr = blockIdx.z % K ;
const int n_ptr = blockIdx.z / K ;
if ((ow >= OW) || (oh >= OH)) return ; // Check Output Boundary
float o = 0.f;
for (int c = 0; c < C; ++c) {
for (int r = 0; r < R; ++r) {
for (int s = 0; s < S; ++s) {
int h = oh * stride - pad + r * dilation;
int w = ow * stride - pad + s * dilation;
if (h < 0 || h >= H || w < 0 || w >= W) continue; // check Input boundary
float i = in_d [n_ptr * C * H * W + c * H * W + h * W + w];
float f = filter_d [k_ptr * C * R * S + c * R * S + r * S + s];
o += i * f;
} // s
} // r
} // c
out_d[n_ptr * K * OH * OW + k_ptr * OH * OW + oh * OW + ow] = o;
}
// CUDA Kernel 2 -- Index-calculation strength reduction
// Filter loop unrolled by 8 (scalar tail covers the S % 8 remainder)
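// kernel2 is only launched when pad == 0, dilation == 1, and stride == 1
// (see convolution() below), so the input coordinates collapse to
// h = oh + r and w = ow + s and the per-element boundary test of kernel1 can
// be dropped: oh <= OH - 1 = H - R and r <= R - 1 imply h <= H - 1 (and the
// same argument holds for w). The innermost s loop is unrolled by 8; the
// scalar loop after it handles the S % 8 remainder.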
__global__ void kernel2(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
const int ow_thread = threadIdx.x ;
const int oh_thread = threadIdx.y ;
const int ow_block = blockIdx.x ;
const int oh_block = blockIdx.y ;
const int ow = (ow_block * blockDim.x) + ow_thread ;
const int oh = (oh_block * blockDim.y) + oh_thread ;
const int k_ptr = blockIdx.z % K ;
const int n_ptr = blockIdx.z / K ;
const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ;
const int i_base = n_ptr * C * H * W ;
const int f_base = k_ptr * C * R * S ;
float i[8] ;
float f[8] ;
const int s_start = S / 8 * 8 ;
if ((ow >= OW) || (oh >= OH)) return ;
float o = 0.f;
for (int c = 0; c < C; ++c) {
int i_base_c = i_base + c * H * W ;
int f_base_c = f_base + c * R * S ;
for (int r = 0; r < R ; r++ ){
for (int s = 0; s < s_start; s += 8) {
int h = oh + r;
int w = ow + s;
i[ 0] = in_d [i_base_c + h * W + w + 0] ;
i[ 1] = in_d [i_base_c + h * W + w + 1] ;
i[ 2] = in_d [i_base_c + h * W + w + 2] ;
i[ 3] = in_d [i_base_c + h * W + w + 3] ;
i[ 4] = in_d [i_base_c + h * W + w + 4] ;
i[ 5] = in_d [i_base_c + h * W + w + 5] ;
i[ 6] = in_d [i_base_c + h * W + w + 6] ;
i[ 7] = in_d [i_base_c + h * W + w + 7] ;
f[ 0] = filter_d[f_base_c + r * S + s + 0] ;
f[ 1] = filter_d[f_base_c + r * S + s + 1] ;
f[ 2] = filter_d[f_base_c + r * S + s + 2] ;
f[ 3] = filter_d[f_base_c + r * S + s + 3] ;
f[ 4] = filter_d[f_base_c + r * S + s + 4] ;
f[ 5] = filter_d[f_base_c + r * S + s + 5] ;
f[ 6] = filter_d[f_base_c + r * S + s + 6] ;
f[ 7] = filter_d[f_base_c + r * S + s + 7] ;
o = i[ 0] * f[ 0] +
i[ 1] * f[ 1] +
i[ 2] * f[ 2] +
i[ 3] * f[ 3] +
i[ 4] * f[ 4] +
i[ 5] * f[ 5] +
i[ 6] * f[ 6] +
i[ 7] * f[ 7] +
o ;
} // s
for (int s = s_start; s < S ; ++s) {
int h = oh + r;
int w = ow + s;
float i = in_d [i_base_c + h * W + w];
float f = filter_d[f_base_c + r * S + s];
o += i * f;
} // s
} // r
} // c
out_d[o_idx + oh * OW + ow] = o;
}
// CUDA Kernel 3 -- Filter load using float4 vector
// Called when S is a multiple of 4
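// kernel3 is kept below in commented-out form for reference; the launch code
// in convolution() uses kernel4 instead whenever the float4 conditions hold.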
//-- __global__ void kernel3(float *in_d, float *out_d, float4 *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
//--
//-- const int ow_thread = threadIdx.x ;
//-- const int oh_thread = threadIdx.y ;
//-- const int ow_block = blockIdx.x ;
//-- const int oh_block = blockIdx.y ;
//--
//-- const int ow = (ow_block * blockDim.x) + ow_thread ;
//-- const int oh = (oh_block * blockDim.y) + oh_thread ;
//--
//-- const int k_ptr = blockIdx.z % K ;
//-- const int n_ptr = blockIdx.z / K ;
//--
//-- const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ;
//-- const int i_base = n_ptr * C * H * W ;
//-- const int f_base = k_ptr * C * R * S ;
//--
//-- float4 i ;
//-- float4 f ;
//-- float4 vo ;
//--
//-- if ((ow >= OW) || (oh >= OH)) return ;
//-- float o = 0.f;
//-- for (int c = 0; c < C; ++c) {
//-- int i_base_c = i_base + c * H * W ;
//-- int f_base_c = f_base + c * R * S ;
//--
//-- for (int r = 0; r < R ; r++ ){
//-- for (int s = 0; s < S ; s += 4) {
//--
//-- int h = oh + r;
//-- int w = ow + s;
//--
//-- i.x = in_d [i_base_c + h * W + w + 0] ;
//-- i.y = in_d [i_base_c + h * W + w + 1] ;
//-- i.z = in_d [i_base_c + h * W + w + 2] ;
//-- i.w = in_d [i_base_c + h * W + w + 3] ;
//--
//-- f = filter_d[(f_base_c + r * S + s)/4];
//--
//-- vo = make_float4(i.x * f.x, i.y * f.y, i.z * f.z , i.w * f.w);
//--
//-- o = vo.x + vo.y + vo.z + vo.w + o;
//-- } // s
//-- } // r
//-- } // c
//-- out_d[o_idx + oh * OW + ow] = o;
//-- }
// CUDA Kernel 4 -- Enlarging the work per thread from 1 output to 4 outputs
// Input image load using float4 vector
// Filter load using float4 vector
// Called when both S and W are multiples of 4
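// Scheme: each thread owns the 4 adjacent output columns ow_base .. ow_base+3.
// For one float4 of filter taps (s .. s+3) those 4 outputs need the 7 input
// samples at w = ow_base+s .. ow_base+s+6, so two consecutive float4 loads
// (i0, i1) supply i[0..7] and the four shifted dot products vo[0..3] reuse
// them. The /4 indexing below stays float4-aligned because W, S, and ow_base
// are all multiples of 4. Stores are guarded individually so a partial tile at
// the right edge (ow_base + j >= OW) is not written.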
__global__ void kernel4(float4 *in_d, float *out_d, float4 *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
const int ow_thread = threadIdx.x ;
const int oh_thread = threadIdx.y ;
const int ow_block = blockIdx.x ;
const int oh_block = blockIdx.y ;
const int ow_base = (ow_block * blockDim.x + ow_thread)* 4 ;
const int oh = (oh_block * blockDim.y) + oh_thread ;
const int k_ptr = blockIdx.z % K ;
const int n_ptr = blockIdx.z / K ;
const int o_idx = n_ptr * K * OH * OW + k_ptr * OH * OW ;
const int i_base = n_ptr * C * H * W ;
const int f_base = k_ptr * C * R * S ;
float4 i0 ;
float4 i1 ;
float i[8] ;
float4 f ;
float4 vo [4] ;
if (oh >= OH) return ;
float o[4] = {0.f, 0.f, 0.f, 0.f};
for (int c = 0; c < C; ++c) {
int i_base_c = i_base + c * H * W ;
int f_base_c = f_base + c * R * S ;
for (int r = 0; r < R ; r++ ){
int h = oh + r;
for (int s = 0; s < S ; s += 4) {
f = filter_d[(f_base_c + r * S + s)/4];
// load 8 consecutive input samples covering the 4 adjacent output columns
i0 = in_d [(i_base_c + h * W + ow_base + s ) / 4];
i1 = in_d [(i_base_c + h * W + ow_base + s + 4) / 4];
i[0] = i0.x ;
i[1] = i0.y ;
i[2] = i0.z ;
i[3] = i0.w ;
i[4] = i1.x ;
i[5] = i1.y ;
i[6] = i1.z ;
i[7] = i1.w ;
vo[0] = make_float4(i[0] * f.x, i[1] * f.y, i[2] * f.z , i[3] * f.w);
vo[1] = make_float4(i[1] * f.x, i[2] * f.y, i[3] * f.z , i[4] * f.w);
vo[2] = make_float4(i[2] * f.x, i[3] * f.y, i[4] * f.z , i[5] * f.w);
vo[3] = make_float4(i[3] * f.x, i[4] * f.y, i[5] * f.z , i[6] * f.w);
o [0] = vo[0].x + vo[0].y + vo[0].z + vo[0].w + o[0];
o [1] = vo[1].x + vo[1].y + vo[1].z + vo[1].w + o[1];
o [2] = vo[2].x + vo[2].y + vo[2].z + vo[2].w + o[2];
o [3] = vo[3].x + vo[3].y + vo[3].z + vo[3].w + o[3];
} // s
} // r
} // c
if (ow_base + 0 < OW)
out_d[o_idx + oh * OW + ow_base + 0] = o[0];
if (ow_base + 1 < OW)
out_d[o_idx + oh * OW + ow_base + 1] = o[1];
if (ow_base + 2 < OW)
out_d[o_idx + oh * OW + ow_base + 2] = o[2];
if (ow_base + 3 < OW)
out_d[o_idx + oh * OW + ow_base + 3] = o[3];
}
void convolution(
float *_input, float *_output, float *_filter,
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
// Assign static global variables
input = _input ;
output = _output ;
filter = _filter ;
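// Inter-node split (only when two MPI ranks are present): rank 0 keeps the
// first size[0] batches and ships the remaining size[1] batches plus the full
// filter to rank 1, which works on its own freshly allocated buffers.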
MPI_Request request[2];
MPI_Status status;
if (mpi_rank == 0 && mpi_world_size == 2) {
MPI_Isend(&input[size[0]*C*H*W], size[1]*C*H*W, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[0]);
MPI_Isend( filter , K*C*R*S , MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request[1]);
MPI_Waitall(2, request, MPI_STATUSES_IGNORE);
}
else if (mpi_world_size == 2) {
alloc_tensor(&input , size[1], C, H , W );
alloc_tensor(&output, size[1], K, OH, OW);
alloc_tensor(&filter, K , C, R , S );
MPI_Recv(input , size[1]*C*H*W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(filter, K*C*R*S , MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
}
// Cuda Memory Copy Host to Device
cuda_init();
// Grid Dimension
int grid_X_cnt = (OW + TS - 1) / TS ;
int grid_Y_cnt = (OH + TS - 1) / TS ;
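// One thread block covers a TS x TS tile of the output plane; the z dimension
// of the grid enumerates every (batch, output-channel) pair handled by GPU i,
// i.e. N_siz[i] * K blocks per tile position.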
// Launch kernel on every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
if ((pad == 0) && (dilation == 1) && (stride == 1)) { // index math collapses to h = oh + r, w = ow + s
if ((S%4 == 0) && (W%4==0)) { // float4 offsets stay 4-float aligned
dim3 blockDim( TS / 4 , TS , 1 ); // Max Thread = 1024
dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
kernel4<<<gridDim, blockDim>>>((float4*)input__d[i], output_d[i], (float4 *)filter_d[i], N_siz[i], C, H ,W, K, R, S, OH, OW);
//dim3 blockDim( TS , TS , 1 ); // Max Thread = 1024
//dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
//kernel3<<<gridDim, blockDim>>>(input__d[i], output_d[i], (float4 *)filter_d[i], N_siz[i], C, H ,W, K, R, S, OH, OW);
}
else { // Scalar Memory Access
dim3 blockDim( TS , TS , 1 ); // Max Thread = 1024
dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
kernel2<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i], N_siz[i], C, H ,W, K, R, S, OH, OW);
}
} else { // Using multiply operation
dim3 blockDim( TS , TS , 1 ); // Max Thread = 1024
dim3 gridDim ( grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
kernel1<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i], N_siz[i], C, H ,W, K, R, S, pad, dilation, stride, OH, OW);
}
//cudaError_t cudaRunStatus = cudaGetLastError();
//if (cudaRunStatus != cudaSuccess) {
// fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__ - 3, cudaRunStatus, cudaGetErrorString(cudaRunStatus));
// exit(1);
//}
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
// Cuda Mem Copy Device to Host
cuda_final();
// MPI Result Gather
if (mpi_rank == 0 && mpi_world_size == 2) {
MPI_Recv(&output[size[0]*K*OH*OW], size[1]*K*OH*OW, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &status);
}
else if(mpi_world_size == 2){
MPI_Isend(output, size[1]*K*OH*OW, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request[0]);
MPI_Wait(&request[0], MPI_STATUS_IGNORE);
}
}
void cuda_init(){
// Upload the input and filter tensors to every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMemcpy(input__d[i], &input[N_str[i] * C * H * W], N_siz[i] * C * H * W * sizeof(float), cudaMemcpyHostToDevice) );
CUDA_CALL( cudaMemcpy(filter_d[i], filter , K * C * R * S * sizeof(float), cudaMemcpyHostToDevice) );
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
}
void cuda_final(){
// Download the output tensor from every GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMemcpy(&output[N_str[i] * K * OH * OW], output_d[i], N_siz[i] * K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost) );
}
// DO NOT REMOVE; NEEDED FOR TIME MEASURE
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaDeviceSynchronize() );
}
}
void convolution_init(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
N = _N; C = _C; H = _H; W = _W;
K = _K; R = _R; S = _S;
pad = _pad;
dilation = _dilation;
stride = _stride;
OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
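// Worked example (illustrative numbers only): H = 128, pad = 0, dilation = 1,
// R = 3, stride = 1 gives OH = (128 + 0 - 1*(3-1) - 1) / 1 + 1 = 126.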
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);
if (mpi_world_size == 2) size[1] = (int) ( (float)_N * 0.4f);
else size[1] = 0 ;
size[0] = N - size[1];
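// With two ranks, rank 1 receives roughly 40% of the batches and rank 0 keeps
// the rest; the 0.4 factor is a load-balancing ratio that can be tuned.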
// Cuda Device Initialize
CUDA_CALL( cudaGetDeviceCount(&num_devices) );
//num_devices = 1;
for (int i = 0; i < num_devices; i++) {
cudaDeviceProp prop;
CUDA_CALL( cudaGetDeviceProperties(&prop, i) );
// Try printing more detailed information here
printf("[GPU %d] %s\n", i, prop.name);
}
if (num_devices <= 0) {
printf("No CUDA device found. Aborting\n");
exit(1);
}
// Setup problem size for each GPU
int mod = size[mpi_rank] % num_devices ;
for (int i = 0 ; i < num_devices ; i++){
N_siz[i] = size[mpi_rank] / num_devices ;
}
for (int i = 0 ; i < mod ; i++){
N_siz[i] ++ ;
}
N_str [0] = 0 ;
for (int i = 1 ; i < num_devices ; i++){
N_str[i] = N_str[i - 1] + N_siz[i - 1];
}
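// Example split (illustrative only): size[mpi_rank] = 7 batches on 4 GPUs
// gives N_siz = {2, 2, 2, 1} and starting offsets N_str = {0, 2, 4, 6}.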
// Allocate device memory for each GPU
for (int i = 0; i < num_devices; i++) {
CUDA_CALL( cudaSetDevice(i) );
CUDA_CALL( cudaMalloc(&input__d[i], N_siz[i] * C* H *W * sizeof(float)) );
CUDA_CALL( cudaMalloc(&filter_d[i], K * C* R *S * sizeof(float)) );
CUDA_CALL( cudaMalloc(&output_d[i], N_siz[i] * K* OH*OW * sizeof(float)) );
}
//printf("\n\n Size[0] = %d, Size[1] = %d\n\n", size[0], size[1]);
}
void convolution_final(
int _N, int _C, int _H, int _W,
int _K, int _R, int _S,
int _pad, int _dilation, int _stride) {
}