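// Distributed 2D convolution: the N-sample batch is split across MPI ranks
// and, within each rank, across its CUDA devices. Each sample is computed as
// im2col followed by a tiled SGEMM: the filter matrix (K x C*R*S) times the
// column buffer (C*R*S x OH*OW) yields one output sample (K x OH*OW).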
#include "convolution.h"
|
|
#include <mpi.h>
|
|
#include <stdio.h>
|
|
#include <cuda_runtime.h>
|
|
#include "util.h"
|
|
|
|
#define CUDA_CALL(f) \
|
|
{ \
|
|
cudaError_t err = (f); \
|
|
if (err != cudaSuccess) { \
|
|
fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
|
|
err, cudaGetErrorString(err)); \
|
|
exit(1); \
|
|
} \
|
|
}

// Problem-size limits and tuning constants
#define MAX_NODE_COUNT 2
#define MAX_GPU_COUNT 4
#define SGEMM_BLOCKSIZE 16
#define IM2COL_NTHREADS 1024

// Per-node (MPI) state
static int num_devices;
static float *input, *output, *filter;
static int N, C, H, W;
static int K, R, S;
static int OH, OW;
static int pad, dilation, stride;
static int mpi_rank, mpi_world_size;
static int wbegin[MAX_NODE_COUNT], wend[MAX_NODE_COUNT];
static int wsize[MAX_NODE_COUNT];
static int mbegin[MAX_NODE_COUNT][MAX_GPU_COUNT];
static int mend[MAX_NODE_COUNT][MAX_GPU_COUNT];
static int msize[MAX_NODE_COUNT][MAX_GPU_COUNT];
static int rounded_M, rounded_N;

// Per-GPU buffers and streams
static float *h_input[MAX_GPU_COUNT];   // pinned host staging buffers
static float *h_output[MAX_GPU_COUNT];
static float *d_input[MAX_GPU_COUNT];
static float *d_filter[MAX_GPU_COUNT];
static float *d_col[MAX_GPU_COUNT];
static float *d_output[MAX_GPU_COUNT];

static cudaStream_t stream[MAX_GPU_COUNT];

// Kernel forward declarations
__global__ void im2col_gpu_kernel(const int n, const float *data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad, const int stride, const int dilation, const int height_col,
    const int width_col, float *data_col);

__global__ void sgemm_gpu_kernel(float *A, float *B, float *C,
    int M, int N, int K);

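// Per-sample GPU pipeline (one CUDA stream per device):
//   1. copy one input sample (C x H x W) from the pinned staging buffer to
//      the device,
//   2. expand it with im2col into a (C*R*S) x (OH*OW) column buffer,
//   3. multiply the filter matrix (K x C*R*S) by the column buffer with the
//      tiled SGEMM kernel to get the (K x OH*OW) output sample,
//   4. copy the result back to the pinned host output buffer.
// All devices advance sample by sample; the last device also processes the
// remainder samples assigned to it.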
static void convolution_gpu(void) {
  int n;

  int im2col_kernels = C * OH * OW;
  int im2col_blocks = (im2col_kernels + IM2COL_NTHREADS - 1) / IM2COL_NTHREADS;

  dim3 sgemm_blockDim(SGEMM_BLOCKSIZE, SGEMM_BLOCKSIZE);
  dim3 sgemm_gridDim(rounded_N / SGEMM_BLOCKSIZE, rounded_M / SGEMM_BLOCKSIZE);

  // Copy the filter to every device once.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(d_filter[i], filter, K * C * R * S * sizeof(float),
                          cudaMemcpyHostToDevice) );
  }

  // Samples that every device has: msize[mpi_rank][0] is the common
  // per-device count, and work is issued asynchronously on each device's
  // stream so the devices run concurrently.
  for (n = 0; n < msize[mpi_rank][0]; n++) {
    for (int i = 0; i < num_devices; i++) {
      if (n >= msize[mpi_rank][i]) {
        continue;
      }
      CUDA_CALL( cudaSetDevice(i) );
      CUDA_CALL( cudaMemcpyAsync(d_input[i],
                                 h_input[i] + n * C * H * W,
                                 C * H * W * sizeof(float),
                                 cudaMemcpyHostToDevice, stream[i]) );

      im2col_gpu_kernel<<<im2col_blocks, IM2COL_NTHREADS, 0, stream[i]>>>(
          im2col_kernels,
          d_input[i],
          H, W, R, S,
          pad, stride, dilation, OH, OW, d_col[i]);

      sgemm_gpu_kernel<<<sgemm_gridDim, sgemm_blockDim, 0, stream[i]>>>(
          d_filter[i], d_col[i],
          d_output[i],
          K, OH * OW, R * S * C);

      CUDA_CALL( cudaMemcpyAsync(h_output[i] + n * K * OH * OW,
                                 d_output[i],
                                 K * OH * OW * sizeof(float),
                                 cudaMemcpyDeviceToHost, stream[i]) );
    }
  }

  // The last device picks up the remainder samples (msize[mpi_rank][d] may
  // exceed the common per-device count).
  int d = num_devices - 1;
  for (; n < msize[mpi_rank][d]; n++) {
    CUDA_CALL( cudaSetDevice(d) );
    CUDA_CALL( cudaMemcpyAsync(d_input[d],
                               h_input[d] + n * C * H * W,
                               C * H * W * sizeof(float),
                               cudaMemcpyHostToDevice, stream[d]) );

    im2col_gpu_kernel<<<im2col_blocks, IM2COL_NTHREADS, 0, stream[d]>>>(
        im2col_kernels,
        d_input[d],
        H, W, R, S,
        pad, stride, dilation, OH, OW, d_col[d]);

    sgemm_gpu_kernel<<<sgemm_gridDim, sgemm_blockDim, 0, stream[d]>>>(
        d_filter[d], d_col[d],
        d_output[d],
        K, OH * OW, R * S * C);

    CUDA_CALL( cudaMemcpyAsync(h_output[d] + n * K * OH * OW,
                               d_output[d],
                               K * OH * OW * sizeof(float),
                               cudaMemcpyDeviceToHost, stream[d]) );
  }

  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

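// Host-side driver: rank 0 scatters each rank's per-device input slices with
// MPI_Isend/MPI_Irecv, the filter is broadcast to all ranks, every rank with
// work runs the GPU pipeline on its share of the batch, and the per-device
// results are gathered back into `output` on rank 0.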
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;

  MPI_Request req[MAX_NODE_COUNT][MAX_GPU_COUNT];

  // Non-root ranks need their own buffer to receive the broadcast filter.
  if (mpi_rank != 0) {
    alloc_tensor(&filter, K, C, R, S);
  }

  // Scattering: rank 0 sends each remote rank the slices its devices will
  // work on and copies its own slices into the pinned staging buffers.
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      for (int j = 0; j < num_devices; j++) {
        MPI_Isend(input + (wbegin[i] + mbegin[i][j]) * C * H * W,
                  msize[i][j] * C * H * W,
                  MPI_FLOAT, i, 0, MPI_COMM_WORLD, &req[i - 1][j]);
      }
    }

    for (int i = 0; i < num_devices; i++) {
      memcpy(h_input[i],
             input + (wbegin[0] + mbegin[0][i]) * C * H * W,
             msize[0][i] * C * H * W * sizeof(float));
    }

    if (mpi_world_size > 1) {
      MPI_Waitall((mpi_world_size - 1) * num_devices,
                  &req[0][0],
                  MPI_STATUSES_IGNORE);
    }
  } else {
    for (int i = 0; i < num_devices; i++) {
      // The MPI count is in elements, not bytes.
      MPI_Irecv(h_input[i],
                msize[mpi_rank][i] * C * H * W,
                MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &req[0][i]);
    }

    if (wsize[mpi_rank] > 0) {
      MPI_Waitall(num_devices, &req[0][0], MPI_STATUSES_IGNORE);
    }
  }

  // Broadcasting the filter to every rank.
  if (mpi_world_size > 1) {
    MPI_Bcast(filter, K * C * R * S, MPI_FLOAT, 0, MPI_COMM_WORLD);
  }

  // Computation on this rank's share of the batch.
  if (wsize[mpi_rank] > 0) {
    convolution_gpu();
  }

  // Gathering the results back to rank 0.
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      for (int j = 0; j < num_devices; j++) {
        MPI_Irecv(output + (wbegin[i] + mbegin[i][j]) * K * OH * OW,
                  msize[i][j] * K * OH * OW,
                  MPI_FLOAT, i, 1, MPI_COMM_WORLD, &req[i - 1][j]);
      }
    }

    for (int i = 0; i < num_devices; i++) {
      memcpy(output + (wbegin[0] + mbegin[0][i]) * K * OH * OW,
             h_output[i],
             msize[0][i] * K * OH * OW * sizeof(float));
    }

    MPI_Waitall((mpi_world_size - 1) * num_devices,
                &req[0][0], MPI_STATUSES_IGNORE);
  } else {
    for (int i = 0; i < num_devices; i++) {
      MPI_Isend(h_output[i],
                msize[mpi_rank][i] * K * OH * OW,
                MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &req[0][i]);
    }

    MPI_Waitall(num_devices, &req[0][0], MPI_STATUSES_IGNORE);
  }
}

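// Setup: derive the output spatial size, round the SGEMM dimensions up to
// multiples of SGEMM_BLOCKSIZE, split the batch across MPI ranks and then
// across each rank's GPUs (remainders go to the last rank / last device), and
// allocate pinned host staging buffers, device buffers, and streams.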
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  rounded_M = (K + SGEMM_BLOCKSIZE - 1) / SGEMM_BLOCKSIZE * SGEMM_BLOCKSIZE;
  rounded_N = ((OH * OW) + SGEMM_BLOCKSIZE - 1) / SGEMM_BLOCKSIZE * SGEMM_BLOCKSIZE;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  if (mpi_rank == 0) {
    printf("Using %d compute nodes\n", mpi_world_size);
  }

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );

  // The per-device arrays are statically sized; never use more devices than
  // MAX_GPU_COUNT.
  if (num_devices > MAX_GPU_COUNT) {
    num_devices = MAX_GPU_COUNT;
  }

  printf("Node %d: Using %d devices\n", mpi_rank, num_devices);

  if (num_devices <= 0) {
    printf("No CUDA device. Aborting\n");
    exit(1);
  }

  // Work distribution across nodes: equal shares of the batch, with the
  // remainder going to the last rank.
  for (int i = 0; i < mpi_world_size; i++) {
    wbegin[i] = N / mpi_world_size * i;
    wend[i] = N / mpi_world_size * (i + 1);
  }
  wend[mpi_world_size - 1] = N;

  for (int i = 0; i < mpi_world_size; i++) {
    wsize[i] = wend[i] - wbegin[i];
  }

  // Work distribution across devices within each node: equal shares, with
  // the remainder going to the last device.
  for (int i = 0; i < mpi_world_size; i++) {
    for (int j = 0; j < num_devices; j++) {
      mbegin[i][j] = (wsize[i] / num_devices) * j;
      mend[i][j] = (wsize[i] / num_devices) * (j + 1);
    }
    mend[i][num_devices - 1] = wsize[i];
  }

  for (int i = 0; i < mpi_world_size; i++) {
    for (int j = 0; j < num_devices; j++) {
      msize[i][j] = mend[i][j] - mbegin[i][j];
    }
  }

  // Memory allocation: pinned host buffers sized for this rank's share, and
  // per-device buffers for one sample at a time. d_filter and d_output are
  // padded to the rounded SGEMM dimensions and zeroed.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );

    CUDA_CALL( cudaStreamCreate(&stream[i]) );

    CUDA_CALL( cudaMallocHost(&h_input[i], msize[mpi_rank][i] * C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMallocHost(&h_output[i], msize[mpi_rank][i] * K * OH * OW * sizeof(float)) );

    CUDA_CALL( cudaMalloc(&d_input[i], C * H * W * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_filter[i], rounded_M * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_output[i], rounded_M * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&d_col[i], R * S * C * OH * OW * sizeof(float)) );

    CUDA_CALL( cudaMemset(d_filter[i], 0, rounded_M * C * R * S * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_output[i], 0, rounded_M * rounded_N * sizeof(float)) );
    CUDA_CALL( cudaMemset(d_col[i], 0, R * S * C * OH * OW * sizeof(float)) );
  }

  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {

  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );

    CUDA_CALL( cudaFree(d_input[i]) );
    CUDA_CALL( cudaFree(d_filter[i]) );
    CUDA_CALL( cudaFree(d_output[i]) );
    CUDA_CALL( cudaFree(d_col[i]) );

    CUDA_CALL( cudaFreeHost(h_input[i]) );
    CUDA_CALL( cudaFreeHost(h_output[i]) );

    CUDA_CALL( cudaStreamDestroy(stream[i]) );
  }
}

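// im2col: each thread handles one (channel, output-row, output-column)
// position and scatters the corresponding R x S input window into the column
// buffer, giving data_col the shape (C*R*S) x (OH*OW) in row-major order.
// Positions that fall into the padding are written as zero.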
__global__ void im2col_gpu_kernel(const int n, const float *data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad, const int stride, const int dilation, const int height_col,
    const int width_col, float *data_col) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < n;
       index += blockDim.x * gridDim.x) {
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    const int c_col = c_im * kernel_h * kernel_w;
    const int h_offset = h_col * stride - pad;
    const int w_offset = w_col * stride - pad;
    float *data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float *data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation;
        int w_im = w_offset + j * dilation;
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * dilation * width + j * dilation] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}

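// Tiled SGEMM: C (M x N) = A (M x K) * B (K x N), one
// SGEMM_BLOCKSIZE x SGEMM_BLOCKSIZE output tile per thread block. The grid is
// sized for the rounded-up M and N, so tile loads are zero-padded at the K
// and N boundaries and threads past column N skip the final store.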
__global__ void sgemm_gpu_kernel(float *A, float *B, float *C,
    int M, int N, int K) {
  int bx = blockIdx.x;
  int by = blockIdx.y;

  int tx = threadIdx.x;
  int ty = threadIdx.y;

  int aBegin = K * SGEMM_BLOCKSIZE * by;
  int aEnd = aBegin + K - 1;
  int aStep = SGEMM_BLOCKSIZE;

  int bBegin = SGEMM_BLOCKSIZE * bx;
  int bStep = SGEMM_BLOCKSIZE * N;

  float Csub = 0;

  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    __shared__ float As[SGEMM_BLOCKSIZE][SGEMM_BLOCKSIZE];
    __shared__ float Bs[SGEMM_BLOCKSIZE][SGEMM_BLOCKSIZE];

    // Load one tile of A and one tile of B into shared memory. Positions
    // past the K (or N) boundary are padded with zeros so partial edge tiles
    // do not read out of bounds or add garbage to the sum.
    int k_base = a - aBegin;
    As[ty][tx] = (k_base + tx < K) ? A[a + K * ty + tx] : 0.0f;
    Bs[ty][tx] = (k_base + ty < K && SGEMM_BLOCKSIZE * bx + tx < N)
                     ? B[b + N * ty + tx] : 0.0f;

    __syncthreads();

#pragma unroll
    for (int k = 0; k < SGEMM_BLOCKSIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    __syncthreads();
  }

  // Rows past M land in the zero-padded tail of C and are never copied back;
  // only columns past N must be skipped to stay within each row.
  if (bx * SGEMM_BLOCKSIZE + tx < N) {
    int c = N * SGEMM_BLOCKSIZE * by + SGEMM_BLOCKSIZE * bx;
    C[c + N * ty + tx] = Csub;
  }
}