#include "convolution.h"
|
|
#include <mpi.h>
|
|
#include <stdio.h>
|
|
#include "util.h"
|
|
#include <immintrin.h>
|
|
|
|
#include <cstdio>
|
|
#include <cuda_runtime.h>
|
|
|
|
#define TS 32
|
|
|
|
static float *input, *output, *filter;
|
|
static int N, C, H, W;
|
|
static int K, R, S;
|
|
static int OH, OW;
|
|
static int pad;
|
|
static int dilation;
|
|
static int stride;
|
|
static int mpi_rank, mpi_world_size;
|
|
static int size[2];
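// size[0] / size[1]: number of input batches handled by MPI rank 0 / rank 1
// (set in convolution_init(); size[1] stays 0 unless exactly two ranks run).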

#define CUDA_CALL(f)                                                       \
  {                                                                        \
    cudaError_t err = (f);                                                 \
    if (err != cudaSuccess) {                                              \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              err, cudaGetErrorString(err));                               \
      exit(1);                                                             \
    }                                                                      \
  }

#define MAX_NUM_GPU 4

int num_devices = 0;

static float *input__d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
static int N_str[MAX_NUM_GPU]; // first batch index assigned to GPU i
static int N_siz[MAX_NUM_GPU]; // number of batches assigned to GPU i

void cuda_init();
void cuda_final();
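
// The driver is expected to call convolution_init() once, then convolution(),
// then convolution_final() (see convolution.h for the declared interface).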

// Convolution reference (naive CPU loops)
// for (int n = 0; n < N; ++n) {
//   for (int k = 0; k < K; ++k) {
//     for (int oh = 0; oh < OH; ++oh) {
//       for (int ow = 0; ow < OW; ++ow) {
//         float o = 0.f;
//         for (int c = 0; c < C; ++c) {
//           for (int r = 0; r < R; ++r) {
//             for (int s = 0; s < S; ++s) {
//               int h = oh * stride - pad + r * dilation;
//               int w = ow * stride - pad + s * dilation;
//               if (h < 0 || h >= H || w < 0 || w >= W) continue;
//               float i = in_d [n_ptr * C * H * W + c * H * W + h * W + w];
//               float f = filter_d[k_ptr * C * R * S + c * R * S + r * S + s];
//               o += i * f;
//             } // s
//           } // r
//         } // c
//         out_d[n_ptr * K * OH * OW + k_ptr * OH * OW + oh * OW + ow] = o;
//       } // ow
//     } // oh
//   } // k
// } // n

// CUDA Kernel 1 -- naive loop version (general pad / dilation / stride)
__global__ void kernel1(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int pad, int dilation, int stride, int OH, int OW) {
  const int ow_thread = threadIdx.x;
  const int oh_thread = threadIdx.y;
  const int ow_block  = blockIdx.x;
  const int oh_block  = blockIdx.y;

  const int ow = (ow_block * blockDim.x) + ow_thread;
  const int oh = (oh_block * blockDim.y) + oh_thread;

  // gridDim.z = N * K, so blockIdx.z encodes the (batch, output-channel) pair
  const int k_ptr = blockIdx.z % K;
  const int n_ptr = blockIdx.z / K;

  if ((ow >= OW) || (oh >= OH)) return; // check output boundary

  float o = 0.f;
  for (int c = 0; c < C; ++c) {
    for (int r = 0; r < R; ++r) {
      for (int s = 0; s < S; ++s) {
        int h = oh * stride - pad + r * dilation;
        int w = ow * stride - pad + s * dilation;

        if (h < 0 || h >= H || w < 0 || w >= W) continue; // check input boundary

        float i = in_d    [n_ptr * C * H * W + c * H * W + h * W + w];
        float f = filter_d[k_ptr * C * R * S + c * R * S + r * S + s];

        o += i * f;
      } // s
    } // r
  } // c
  out_d[n_ptr * K * OH * OW + k_ptr * OH * OW + oh * OW + ow] = o;
}
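
// kernel1 is the general fallback: the dispatch in convolution() below selects
// it whenever pad, dilation, or stride make the index arithmetic irregular.
// Kernels 2 and 4 assume pad == 0, dilation == 1, stride == 1 and drop those
// checks.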

// CUDA Kernel 2 -- strength-reduced index calculation, with the filter loop
// unrolled 8-wide. Assumes pad == 0, dilation == 1, stride == 1, so the input
// position is simply h = oh + r, w = ow + s.
__global__ void kernel2(float *in_d, float *out_d, float *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {

  const int ow_thread = threadIdx.x;
  const int oh_thread = threadIdx.y;
  const int ow_block  = blockIdx.x;
  const int oh_block  = blockIdx.y;

  const int ow = (ow_block * blockDim.x) + ow_thread;
  const int oh = (oh_block * blockDim.y) + oh_thread;

  const int k_ptr = blockIdx.z % K;
  const int n_ptr = blockIdx.z / K;

  const int o_idx  = n_ptr * K * OH * OW + k_ptr * OH * OW;
  const int i_base = n_ptr * C * H * W;
  const int f_base = k_ptr * C * R * S;

  float i[8];
  float f[8];
  const int s_start = S / 8 * 8; // filter columns covered by the unrolled loop

  if ((ow >= OW) || (oh >= OH)) return;

  float o = 0.f;
  for (int c = 0; c < C; ++c) {
    int i_base_c = i_base + c * H * W;
    int f_base_c = f_base + c * R * S;

    for (int r = 0; r < R; r++) {
      for (int s = 0; s < s_start; s += 8) {
        int h = oh + r;
        int w = ow + s;

        i[0] = in_d[i_base_c + h * W + w + 0];
        i[1] = in_d[i_base_c + h * W + w + 1];
        i[2] = in_d[i_base_c + h * W + w + 2];
        i[3] = in_d[i_base_c + h * W + w + 3];
        i[4] = in_d[i_base_c + h * W + w + 4];
        i[5] = in_d[i_base_c + h * W + w + 5];
        i[6] = in_d[i_base_c + h * W + w + 6];
        i[7] = in_d[i_base_c + h * W + w + 7];

        f[0] = filter_d[f_base_c + r * S + s + 0];
        f[1] = filter_d[f_base_c + r * S + s + 1];
        f[2] = filter_d[f_base_c + r * S + s + 2];
        f[3] = filter_d[f_base_c + r * S + s + 3];
        f[4] = filter_d[f_base_c + r * S + s + 4];
        f[5] = filter_d[f_base_c + r * S + s + 5];
        f[6] = filter_d[f_base_c + r * S + s + 6];
        f[7] = filter_d[f_base_c + r * S + s + 7];

        o = i[0] * f[0] + i[1] * f[1] + i[2] * f[2] + i[3] * f[3] +
            i[4] * f[4] + i[5] * f[5] + i[6] * f[6] + i[7] * f[7] + o;
      } // s

      // remainder loop: filter columns not covered by the 8-wide unroll
      for (int s = s_start; s < S; ++s) {
        int h = oh + r;
        int w = ow + s;
        float i = in_d    [i_base_c + h * W + w];
        float f = filter_d[f_base_c + r * S + s];
        o += i * f;
      } // s
    } // r
  } // c
  out_d[o_idx + oh * OW + ow] = o;
}
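
// Why the input boundary check can be dropped here: with pad == 0,
// dilation == 1, stride == 1 we have OH = H - R + 1, so
// h = oh + r <= (OH - 1) + (R - 1) = H - 1, and likewise w <= W - 1.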

// CUDA Kernel 3 -- filter load using a float4 vector.
// Called when S is a multiple of 4. Disabled; the dispatch in convolution()
// now uses kernel4 on this path.
//-- __global__ void kernel3(float *in_d, float *out_d, float4 *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {
//--
//--   const int ow_thread = threadIdx.x;
//--   const int oh_thread = threadIdx.y;
//--   const int ow_block  = blockIdx.x;
//--   const int oh_block  = blockIdx.y;
//--
//--   const int ow = (ow_block * blockDim.x) + ow_thread;
//--   const int oh = (oh_block * blockDim.y) + oh_thread;
//--
//--   const int k_ptr = blockIdx.z % K;
//--   const int n_ptr = blockIdx.z / K;
//--
//--   const int o_idx  = n_ptr * K * OH * OW + k_ptr * OH * OW;
//--   const int i_base = n_ptr * C * H * W;
//--   const int f_base = k_ptr * C * R * S;
//--
//--   float4 i;
//--   float4 f;
//--   float4 vo;
//--
//--   if ((ow >= OW) || (oh >= OH)) return;
//--   float o = 0.f;
//--   for (int c = 0; c < C; ++c) {
//--     int i_base_c = i_base + c * H * W;
//--     int f_base_c = f_base + c * R * S;
//--
//--     for (int r = 0; r < R; r++) {
//--       for (int s = 0; s < S; s += 4) {
//--
//--         int h = oh + r;
//--         int w = ow + s;
//--
//--         i.x = in_d[i_base_c + h * W + w + 0];
//--         i.y = in_d[i_base_c + h * W + w + 1];
//--         i.z = in_d[i_base_c + h * W + w + 2];
//--         i.w = in_d[i_base_c + h * W + w + 3];
//--
//--         f = filter_d[(f_base_c + r * S + s) / 4];
//--
//--         vo = make_float4(i.x * f.x, i.y * f.y, i.z * f.z, i.w * f.w);
//--
//--         o = vo.x + vo.y + vo.z + vo.w + o;
//--       } // s
//--     } // r
//--   } // c
//--   out_d[o_idx + oh * OW + ow] = o;
//-- }

// CUDA Kernel 4 -- enlarges the work per thread from 1 output to 4 outputs.
// Input image and filter are both loaded through float4 vectors.
// Called when both S and W are multiples of 4.
__global__ void kernel4(float4 *in_d, float *out_d, float4 *filter_d, int N, int C, int H, int W, int K, int R, int S, int OH, int OW) {

  const int ow_thread = threadIdx.x;
  const int oh_thread = threadIdx.y;
  const int ow_block  = blockIdx.x;
  const int oh_block  = blockIdx.y;

  const int ow_base = (ow_block * blockDim.x + ow_thread) * 4; // first of 4 outputs
  const int oh = (oh_block * blockDim.y) + oh_thread;

  const int k_ptr = blockIdx.z % K;
  const int n_ptr = blockIdx.z / K;

  const int o_idx  = n_ptr * K * OH * OW + k_ptr * OH * OW;
  const int i_base = n_ptr * C * H * W;
  const int f_base = k_ptr * C * R * S;

  float4 i0;
  float4 i1;
  float  i[8];
  float4 f;
  float4 vo[4];

  if (oh >= OH) return;
  if (ow_base >= OW) return; // nothing to store; also avoids out-of-bounds loads

  float o[4] = {0.f, 0.f, 0.f, 0.f};

  for (int c = 0; c < C; ++c) {
    int i_base_c = i_base + c * H * W;
    int f_base_c = f_base + c * R * S;

    for (int r = 0; r < R; r++) {
      int h = oh + r;
      for (int s = 0; s < S; s += 4) {
        f = filter_d[(f_base_c + r * S + s) / 4];

        // load 8 consecutive inputs: enough to compute 4 adjacent OWs
        i0 = in_d[(i_base_c + h * W + ow_base + s    ) / 4];
        i1 = in_d[(i_base_c + h * W + ow_base + s + 4) / 4];

        i[0] = i0.x;
        i[1] = i0.y;
        i[2] = i0.z;
        i[3] = i0.w;
        i[4] = i1.x;
        i[5] = i1.y;
        i[6] = i1.z;
        i[7] = i1.w;

        // output j uses inputs i[j .. j+3] -- a window sliding by one element
        vo[0] = make_float4(i[0] * f.x, i[1] * f.y, i[2] * f.z, i[3] * f.w);
        vo[1] = make_float4(i[1] * f.x, i[2] * f.y, i[3] * f.z, i[4] * f.w);
        vo[2] = make_float4(i[2] * f.x, i[3] * f.y, i[4] * f.z, i[5] * f.w);
        vo[3] = make_float4(i[3] * f.x, i[4] * f.y, i[5] * f.z, i[6] * f.w);

        o[0] = vo[0].x + vo[0].y + vo[0].z + vo[0].w + o[0];
        o[1] = vo[1].x + vo[1].y + vo[1].z + vo[1].w + o[1];
        o[2] = vo[2].x + vo[2].y + vo[2].z + vo[2].w + o[2];
        o[3] = vo[3].x + vo[3].y + vo[3].z + vo[3].w + o[3];
      } // s
    } // r
  } // c

  if (ow_base + 0 < OW)
    out_d[o_idx + oh * OW + ow_base + 0] = o[0];
  if (ow_base + 1 < OW)
    out_d[o_idx + oh * OW + ow_base + 1] = o[1];
  if (ow_base + 2 < OW)
    out_d[o_idx + oh * OW + ow_base + 2] = o[2];
  if (ow_base + 3 < OW)
    out_d[o_idx + oh * OW + ow_base + 3] = o[3];
}
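
// Reuse pattern in kernel4: the two float4 loads bring in 8 consecutive
// inputs, and each of the 4 outputs slides its 4-wide window by one element,
// so input values are reused from registers instead of being re-fetched from
// global memory.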

void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {

  // Assign static global variables
  input  = _input;
  output = _output;
  filter = _filter;

  MPI_Request request;
  MPI_Status status;

  // Rank 0 ships rank 1's share of the input (and the whole filter) over MPI
  if (mpi_rank == 0 && mpi_world_size == 2) {
    MPI_Isend(&input[size[0] * C * H * W], size[1] * C * H * W, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request);
    MPI_Isend(filter, K * C * R * S, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &request);
  }
  else if (mpi_world_size == 2) {
    alloc_tensor(&input,  size[1], C, H,  W);
    alloc_tensor(&output, size[1], K, OH, OW);
    alloc_tensor(&filter, K, C, R, S);
    MPI_Recv(input,  size[1] * C * H * W, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(filter, K * C * R * S, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status);
  }

  // CUDA memory copy, host to device
  cuda_init();

  // Grid dimensions: one TS x TS output tile per block
  int grid_X_cnt = (OW + TS - 1) / TS;
  int grid_Y_cnt = (OH + TS - 1) / TS;

  // Launch a kernel on every GPU
  for (int i = 0; i < num_devices; i++) {

    CUDA_CALL( cudaSetDevice(i) );

    if ((pad == 0) && (dilation == 1) && (stride == 1)) { // no multiply needed for the input index
      if ((S % 4 == 0) && (W % 4 == 0)) { // use the float4-vectorized kernel
        dim3 blockDim(TS / 4, TS, 1); // max 1024 threads; each thread covers 4 outputs in x
        dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
        kernel4<<<gridDim, blockDim>>>((float4 *)input__d[i], output_d[i], (float4 *)filter_d[i], N_siz[i], C, H, W, K, R, S, OH, OW);
        //dim3 blockDim(TS, TS, 1); // max 1024 threads
        //dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
        //kernel3<<<gridDim, blockDim>>>(input__d[i], output_d[i], (float4 *)filter_d[i], N_siz[i], C, H, W, K, R, S, OH, OW);
      }
      else { // scalar memory access
        dim3 blockDim(TS, TS, 1); // max 1024 threads
        dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
        kernel2<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i], N_siz[i], C, H, W, K, R, S, OH, OW);
      }
    } else { // general case: index arithmetic needs pad / dilation / stride
      dim3 blockDim(TS, TS, 1); // max 1024 threads
      dim3 gridDim(grid_X_cnt, grid_Y_cnt, N_siz[i] * K);
      kernel1<<<gridDim, blockDim>>>(input__d[i], output_d[i], filter_d[i], N_siz[i], C, H, W, K, R, S, pad, dilation, stride, OH, OW);
    }
    //cudaError_t cudaRunStatus = cudaGetLastError();
    //if (cudaRunStatus != cudaSuccess) {
    //  fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__ - 3, cudaRunStatus, cudaGetErrorString(cudaRunStatus));
    //  exit(1);
    //}
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) ); // synchronize each device, not just the current one
    CUDA_CALL( cudaDeviceSynchronize() );
  }

  // CUDA memory copy, device to host
  cuda_final();

  // Gather the results over MPI
  if (mpi_rank == 0 && mpi_world_size == 2) {
    MPI_Recv(&output[size[0] * K * OH * OW], size[1] * K * OH * OW, MPI_FLOAT, 1, 0, MPI_COMM_WORLD, &status);
  }
  else if (mpi_world_size == 2) {
    MPI_Isend(output, size[1] * K * OH * OW, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &request);
  }
}

void cuda_init() {
  // Upload this rank's input slice and the full filter to every GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) ); // target GPU i for the copies below
    CUDA_CALL( cudaMemcpy(input__d[i], &input[N_str[i] * C * H * W], N_siz[i] * C * H * W * sizeof(float), cudaMemcpyHostToDevice) );
    CUDA_CALL( cudaMemcpy(filter_d[i], filter, K * C * R * S * sizeof(float), cudaMemcpyHostToDevice) );
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) ); // synchronize each device in turn
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}
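
// Note: input__d / filter_d / output_d are allocated once per process in
// convolution_init(), so cuda_init() only performs the copies.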

void cuda_final() {

  // Download each GPU's output slice back to the host
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(&output[N_str[i] * K * OH * OW], output_d[i], N_siz[i] * K * OH * OW * sizeof(float), cudaMemcpyDeviceToHost) );
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {

  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;
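  // Standard output-size rule: dilation * (R - 1) + 1 is the effective filter
  // height after dilation. E.g. H = 128, pad = 0, dilation = 1, stride = 1,
  // R = 3 gives OH = (128 - 2 - 1) / 1 + 1 = 126.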

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  if (mpi_world_size == 2) size[1] = (int)((float)_N * 0.4f);
  else                     size[1] = 0;
  size[0] = N - size[1];
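  // Rank 1 takes 40% of the batches and rank 0 keeps the rest (e.g. N = 16
  // gives size[1] = 6, size[0] = 10); the ratio looks tuned for this setup.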

  // CUDA device initialization
  CUDA_CALL( cudaGetDeviceCount(&num_devices) );
  if (num_devices > MAX_NUM_GPU) num_devices = MAX_NUM_GPU; // the per-GPU arrays hold at most MAX_NUM_GPU entries

  //num_devices = 1;
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, i) );

    // Try printing more detailed information here
    printf("[GPU %d] %s\n", i, prop.name);
  }

  if (num_devices <= 0) {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }

  // Set up the problem size for each GPU
  int mod = size[mpi_rank] % num_devices;

  for (int i = 0; i < num_devices; i++) {
    N_siz[i] = size[mpi_rank] / num_devices;
  }
  for (int i = 0; i < mod; i++) {
    N_siz[i]++; // spread the remainder, one extra batch per leading GPU
  }

  N_str[0] = 0;
  for (int i = 1; i < num_devices; i++) {
    N_str[i] = N_str[i - 1] + N_siz[i - 1]; // prefix sum of batch counts
  }

  // Allocate device memory on each GPU
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMalloc(&input__d[i], N_siz[i] * C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&filter_d[i], K * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&output_d[i], N_siz[i] * K * OH * OW * sizeof(float)) );
  }
  //printf("\n\n Size[0] = %d, Size[1] = %d\n\n", size[0], size[1]);
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  // Nothing to free here; device buffers from convolution_init() are left for
  // the CUDA runtime to reclaim at process exit.
}