#include "util.h"
|
|
#include <mpi.h>
|
|
|
|
static float *input, *output, *filter;
|
|
static int N, C, H, W;
|
|
static int K, R, S;
|
|
static int OH, OW;
|
|
static int pad;
|
|
static int dilation;
|
|
static int stride;
|
|
static int mpi_rank, mpi_world_size;
|
|
|
|
static int filter_size, im_size, col_size, res_size;
|
|
|
|
#define MAX_NODES 2
|
|
static int ns[MAX_NODES], ne[MAX_NODES];
|
|
|
|
|
|

#define CUDA_CALL(f)                                                          \
  {                                                                           \
    cudaError_t err = (f);                                                    \
    if (err != cudaSuccess) {                                                 \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,    \
              err, cudaGetErrorString(err));                                  \
      exit(1);                                                                \
    }                                                                         \
  }

#define TS (32)
#define WPT (16)
#define RTS (2)
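
// Tiling parameters for the sgemm kernel below: each thread block covers a
// TS x TS tile of the output, is launched with RTS x TS threads, and each
// thread accumulates WPT = TS / RTS rows of that tile in registers.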

#define MAX_NUM_GPU 4
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];
int num_devices = 0;

// Array of device (GPU) pointers
static float *input_d[MAX_NUM_GPU];
static float *input_col_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
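
// Expands one C x H x W image into a (C*R*S) x (OH*OW) column matrix.
// A grid-stride loop assigns each thread one (channel, output row, output
// column) triple; the thread copies the corresponding receptive field into
// data_col, writing zeros where the field falls into the padding region.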
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    float* data_col) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < n;
       index += blockDim.x * gridDim.x) {
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    const int c_col = c_im * kernel_h * kernel_w;
    const int h_offset = h_col * stride_h - pad_h;
    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation_h;
        int w_im = w_offset + j * dilation_w;
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
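
// Tiled matrix multiplication C = A * B with shared-memory tiling and
// register blocking: A is M x K, B is K x N, and each thread produces WPT
// elements of C spaced RTS rows apart. Out-of-range tile elements are
// loaded as zero, so M, N and K need not be multiples of TS.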
__global__ void sgemm(float *A, float *B, float *C, int M, int N, int K, int numTiles) {
  const int row = threadIdx.x;
  const int col = threadIdx.y;
  const int globalRow = TS * blockIdx.x + threadIdx.x;
  const int globalCol = TS * blockIdx.y + threadIdx.y;

  __shared__ float Asub[TS][TS];
  __shared__ float Bsub[TS][TS];

  float acc[WPT];
  for (int w = 0; w < WPT; w++) {
    acc[w] = 0.0f;
  }

  for (int t = 0; t < numTiles; t++) {
    // Cooperatively load one TS x TS tile of A and of B into shared memory.
    for (int w = 0; w < WPT; w++) {
      const int tiledRow = TS * t + row;
      const int tiledCol = TS * t + col;
      if ((globalRow + w * RTS) < M && tiledCol < K)
        Asub[w * RTS + row][col] = A[(globalRow + w * RTS) * K + tiledCol];
      else
        Asub[w * RTS + row][col] = 0.0f;
      if ((tiledRow + w * RTS) < K && globalCol < N)
        Bsub[row + w * RTS][col] = B[(tiledRow + w * RTS) * N + globalCol];
      else
        Bsub[row + w * RTS][col] = 0.0f;
    }

    __syncthreads();

    // Accumulate the partial products for this tile.
    for (int k = 0; k < TS; k++) {
      for (int w = 0; w < WPT; w++) {
        acc[w] += Asub[row + w * RTS][k] * Bsub[k][col];
      }
    }

    __syncthreads();
  }

  // Write back this thread's WPT results, guarding the matrix edges.
  for (int w = 0; w < WPT; w++) {
    if ((globalRow + w * RTS) < M && globalCol < N)
      C[(globalRow + w * RTS) * N + globalCol] = acc[w];
  }
}

void im2col_gpu(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    float* data_col) {
  // Launch channels * height_col * width_col threads, each responsible for
  // copying one receptive field of one channel into the column buffer.
  int height_col = (height + 2 * pad_h -
      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  int width_col = (width + 2 * pad_w -
      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;
  const int num_threads = 512;
  int num_blocks = (num_kernels + num_threads - 1) / num_threads;
  im2col_gpu_kernel<<<num_blocks, num_threads>>>(
      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
      pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
      width_col, data_col);
}
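
// Distributed convolution: rank 0 scatters the input batch and broadcasts the
// filter, each rank runs im2col + GEMM on its GPUs for its share of images,
// and the per-image outputs are gathered back on rank 0.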
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;

  // Ranks other than 0 receive their data, so they need their own buffers.
  if (mpi_rank != 0) {
    alloc_tensor(&input, N, C, H, W);
    alloc_tensor(&output, N, K, OH, OW);
    alloc_tensor(&filter, K, C, R, S);
  }

  // Scatter the input batch: rank 0 sends each rank its slice [ns[i], ne[i]).
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Send(input + ns[i] * im_size, (ne[i] - ns[i]) * im_size, MPI_FLOAT,
               i, 0, MPI_COMM_WORLD);
    }
  } else {
    MPI_Recv(input + ns[mpi_rank] * im_size,
             (ne[mpi_rank] - ns[mpi_rank]) * im_size, MPI_FLOAT,
             0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }

  // Broadcast the filter to every rank.
  MPI_Bcast(filter, filter_size, MPI_FLOAT, 0, MPI_COMM_WORLD);

  // Upload this rank's input slice and the filter to every GPU.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(input_d[i], input + (ns[mpi_rank] + Mbegin[i]) * im_size,
          (Mend[i] - Mbegin[i]) * im_size * sizeof(float), cudaMemcpyHostToDevice) );
    CUDA_CALL( cudaMemcpy(filter_d[i], filter, filter_size * sizeof(float),
          cudaMemcpyHostToDevice) );
  }
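
  // GEMM view of the convolution: the filter is K x (C*R*S), the im2col
  // buffer is (C*R*S) x (OH*OW), and their product is one image's
  // K x (OH*OW) output. The dimensions are rounded up to multiples of TS
  // only to size the launch grid; the kernel guards the edges itself.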
  int M_XL = (K + TS - 1) / TS * TS;
  int N_XL = (OH * OW + TS - 1) / TS * TS;
  int K_XL = (C * R * S + TS - 1) / TS * TS;
  int numTiles = K_XL / TS;
  dim3 blockDim(RTS, TS, 1);
  dim3 gridDim(M_XL / TS, N_XL / TS, 1);

  // On every GPU, run im2col followed by the tiled GEMM for each image.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    for (int j = 0; j < Mend[i] - Mbegin[i]; j++) {
      im2col_gpu(input_d[i] + j * im_size, C, H, W, R, S, pad, pad,
                 stride, stride, dilation, dilation, input_col_d[i]);
      sgemm<<<gridDim, blockDim>>>(filter_d[i], input_col_d[i],
                                   output_d[i] + j * res_size,
                                   K, OH * OW, C * R * S, numTiles);
    }
  }

  // Download the per-image outputs from every GPU.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(output + (ns[mpi_rank] + Mbegin[i]) * res_size, output_d[i],
          (Mend[i] - Mbegin[i]) * res_size * sizeof(float),
          cudaMemcpyDeviceToHost) );
  }

  // Gather the output slices on rank 0.
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Recv(output + ns[i] * res_size, (ne[i] - ns[i]) * res_size, MPI_FLOAT,
               i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  } else {
    MPI_Send(output + ns[mpi_rank] * res_size,
             (ne[mpi_rank] - ns[mpi_rank]) * res_size, MPI_FLOAT,
             0, 0, MPI_COMM_WORLD);
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}
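
// Reads the MPI rank/size, computes the output dimensions, splits the batch
// across ranks (ns/ne) and across this rank's GPUs (Mbegin/Mend), and
// allocates the per-GPU device buffers.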
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );

  //printf("Using %d devices\n", num_devices);
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, i) );

    // Try printing more detailed information here
    //printf("[GPU %d] %s\n", i, prop.name);
  }

  if (num_devices <= 0) {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }
  // The per-device arrays are statically sized, so never use more GPUs.
  if (num_devices > MAX_NUM_GPU) num_devices = MAX_NUM_GPU;

  // Split the batch of N images evenly across MPI ranks; the last rank takes
  // the remainder. Assumes mpi_world_size <= MAX_NODES.
  for (int i = 0; i < mpi_world_size; i++) {
    ns[i] = N / mpi_world_size * i;
    ne[i] = N / mpi_world_size * (i + 1);
  }
  ne[mpi_world_size - 1] = N;

  // Split this rank's share of the batch across its GPUs; the last GPU takes
  // the remainder.
  int batch_size = ne[mpi_rank] - ns[mpi_rank];
  for (int i = 0; i < num_devices; i++) {
    Mbegin[i] = (batch_size / num_devices) * i;
    Mend[i] = (batch_size / num_devices) * (i + 1);
  }
  Mend[num_devices - 1] = batch_size;

  im_size = C * H * W;             // elements in one input image
  col_size = C * R * S * OH * OW;  // elements in one im2col buffer
  res_size = K * OH * OW;          // elements in one output image
  filter_size = K * C * R * S;     // elements in the filter tensor

  // Allocate device memory on each GPU.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMalloc(&input_d[i], (Mend[i] - Mbegin[i]) * im_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&input_col_d[i], col_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&filter_d[i], filter_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&output_d[i], (Mend[i] - Mbegin[i]) * res_size * sizeof(float)) );
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
}