#include "util.h"
|
|
#include <mpi.h>
|
|
|
|
static float *input, *output, *filter;
|
|
static int N, C, H, W;
|
|
static int K, R, S;
|
|
static int OH, OW;
|
|
static int pad;
|
|
static int dilation;
|
|
static int stride;
|
|
static int mpi_rank, mpi_world_size;
|
|
|
|
static int filter_size, im_size, col_size, res_size;
|
|
|
|
#define MAX_NODES 2
|
|
static int ns[MAX_NODES], ne[MAX_NODES];
|
|
|
|
|
|

#define CUDA_CALL(f)                                                          \
  {                                                                           \
    cudaError_t err = (f);                                                    \
    if (err != cudaSuccess) {                                                 \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,    \
              err, cudaGetErrorString(err));                                  \
      exit(1);                                                                \
    }                                                                         \
  }

#define TS (32)
#define WPT (16)
#define RTS (2)
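
// Tiling parameters for the sgemm kernel below: each thread block covers a
// TS x TS tile of the output, is launched with RTS x TS threads, and each
// thread accumulates WPT = TS / RTS rows of that tile in registers.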

#define MAX_NUM_GPU 4
static int Mbegin[MAX_NUM_GPU], Mend[MAX_NUM_GPU];
int num_devices = 0;

// Array of device (GPU) pointers
static float *input_d[MAX_NUM_GPU];
static float *input_col_d[MAX_NUM_GPU];
static float *filter_d[MAX_NUM_GPU];
static float *output_d[MAX_NUM_GPU];
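
// Expands one C x H x W image into a (C*R*S) x (OH*OW) column matrix.
// A grid-stride loop assigns each thread one (channel, output row, output
// column) triple; the thread copies the corresponding receptive field into
// data_col, writing zeros where the field falls into the padding region.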
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int height_col, const int width_col,
    float* data_col) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < n;
       index += blockDim.x * gridDim.x) {
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    const int c_col = c_im * kernel_h * kernel_w;
    const int h_offset = h_col * stride_h - pad_h;
    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation_h;
        int w_im = w_offset + j * dilation_w;
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
            data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
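
// Tiled matrix multiplication C = A * B with shared-memory tiling and
// register blocking: A is M x K, B is K x N, and each thread produces WPT
// elements of C spaced RTS rows apart. Out-of-range tile elements are
// loaded as zero, so M, N and K need not be multiples of TS.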
__global__ void sgemm(float *A, float *B, float *C, int M, int N, int K, int numTiles) {
  const int row = threadIdx.x;
  const int col = threadIdx.y;
  const int globalRow = TS * blockIdx.x + threadIdx.x;
  const int globalCol = TS * blockIdx.y + threadIdx.y;

  __shared__ float Asub[TS][TS];
  __shared__ float Bsub[TS][TS];

  float acc[WPT];
  for (int w = 0; w < WPT; w++) {
    acc[w] = 0.0f;
  }

  for (int t = 0; t < numTiles; t++) {
    // Cooperatively load one TS x TS tile of A and of B into shared memory.
    for (int w = 0; w < WPT; w++) {
      const int tiledRow = TS * t + row;
      const int tiledCol = TS * t + col;
      if ((globalRow + w * RTS) < M && tiledCol < K)
        Asub[w * RTS + row][col] = A[(globalRow + w * RTS) * K + tiledCol];
      else
        Asub[w * RTS + row][col] = 0.0f;
      if ((tiledRow + w * RTS) < K && globalCol < N)
        Bsub[row + w * RTS][col] = B[(tiledRow + w * RTS) * N + globalCol];
      else
        Bsub[row + w * RTS][col] = 0.0f;
    }

    __syncthreads();

    // Accumulate the partial products for this tile.
    for (int k = 0; k < TS; k++) {
      for (int w = 0; w < WPT; w++) {
        acc[w] += Asub[row + w * RTS][k] * Bsub[k][col];
      }
    }

    __syncthreads();
  }

  // Write back this thread's WPT results, guarding the matrix edges.
  for (int w = 0; w < WPT; w++) {
    if ((globalRow + w * RTS) < M && globalCol < N)
      C[(globalRow + w * RTS) * N + globalCol] = acc[w];
  }
}

void im2col_gpu(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    float* data_col) {
  // Launch channels * height_col * width_col threads, each responsible for
  // copying one receptive field of one channel into the column buffer.
  int height_col = (height + 2 * pad_h -
      (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  int width_col = (width + 2 * pad_w -
      (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;
  const int num_threads = 512;
  int num_blocks = (num_kernels + num_threads - 1) / num_threads;
  im2col_gpu_kernel<<<num_blocks, num_threads>>>(
      num_kernels, data_im, height, width, kernel_h, kernel_w, pad_h,
      pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col,
      width_col, data_col);
}
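
// Distributed convolution: rank 0 scatters the input batch and broadcasts the
// filter, each rank runs im2col + GEMM on its GPUs for its share of images,
// and the per-image outputs are gathered back on rank 0.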
void convolution(
    float *_input, float *_output, float *_filter,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  input = _input;
  output = _output;
  filter = _filter;

  // Ranks other than 0 receive their data, so they need their own buffers.
  if (mpi_rank != 0) {
    alloc_tensor(&input, N, C, H, W);
    alloc_tensor(&output, N, K, OH, OW);
    alloc_tensor(&filter, K, C, R, S);
  }

  // Scatter the input batch: rank 0 sends each rank its slice [ns[i], ne[i]).
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Send(input + ns[i] * im_size, (ne[i] - ns[i]) * im_size, MPI_FLOAT,
               i, 0, MPI_COMM_WORLD);
    }
  } else {
    MPI_Recv(input + ns[mpi_rank] * im_size,
             (ne[mpi_rank] - ns[mpi_rank]) * im_size, MPI_FLOAT,
             0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  }

  // Broadcast the filter to every rank.
  MPI_Bcast(filter, filter_size, MPI_FLOAT, 0, MPI_COMM_WORLD);

  // Upload this rank's input slice and the filter to every GPU.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(input_d[i], input + (ns[mpi_rank] + Mbegin[i]) * im_size,
          (Mend[i] - Mbegin[i]) * im_size * sizeof(float), cudaMemcpyHostToDevice) );
    CUDA_CALL( cudaMemcpy(filter_d[i], filter, filter_size * sizeof(float),
          cudaMemcpyHostToDevice) );
  }
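
  // GEMM view of the convolution: the filter is K x (C*R*S), the im2col
  // buffer is (C*R*S) x (OH*OW), and their product is one image's
  // K x (OH*OW) output. The dimensions are rounded up to multiples of TS
  // only to size the launch grid; the kernel guards the edges itself.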
  int M_XL = (K + TS - 1) / TS * TS;
  int N_XL = (OH * OW + TS - 1) / TS * TS;
  int K_XL = (C * R * S + TS - 1) / TS * TS;
  int numTiles = K_XL / TS;
  dim3 blockDim(RTS, TS, 1);
  dim3 gridDim(M_XL / TS, N_XL / TS, 1);

  // On every GPU, run im2col followed by the tiled GEMM for each image.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    for (int j = 0; j < Mend[i] - Mbegin[i]; j++) {
      im2col_gpu(input_d[i] + j * im_size, C, H, W, R, S, pad, pad,
                 stride, stride, dilation, dilation, input_col_d[i]);
      sgemm<<<gridDim, blockDim>>>(filter_d[i], input_col_d[i],
                                   output_d[i] + j * res_size,
                                   K, OH * OW, C * R * S, numTiles);
    }
  }

  // Download the per-image outputs from every GPU.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMemcpy(output + (ns[mpi_rank] + Mbegin[i]) * res_size, output_d[i],
          (Mend[i] - Mbegin[i]) * res_size * sizeof(float),
          cudaMemcpyDeviceToHost) );
  }

  // Gather the output slices on rank 0.
  if (mpi_rank == 0) {
    for (int i = 1; i < mpi_world_size; i++) {
      MPI_Recv(output + ns[i] * res_size, (ne[i] - ns[i]) * res_size, MPI_FLOAT,
               i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  } else {
    MPI_Send(output + ns[mpi_rank] * res_size,
             (ne[mpi_rank] - ns[mpi_rank]) * res_size, MPI_FLOAT,
             0, 0, MPI_COMM_WORLD);
  }

  // DO NOT REMOVE; NEEDED FOR TIME MEASURE
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaDeviceSynchronize() );
  }
}
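
// Reads the MPI rank/size, computes the output dimensions, splits the batch
// across ranks (ns/ne) and across this rank's GPUs (Mbegin/Mend), and
// allocates the per-GPU device buffers.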
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
  N = _N; C = _C; H = _H; W = _W;
  K = _K; R = _R; S = _S;
  pad = _pad;
  dilation = _dilation;
  stride = _stride;

  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size);

  OH = (H + 2 * pad - dilation * (R - 1) - 1) / stride + 1;
  OW = (W + 2 * pad - dilation * (S - 1) - 1) / stride + 1;

  CUDA_CALL( cudaGetDeviceCount(&num_devices) );

  //printf("Using %d devices\n", num_devices);
  for (int i = 0; i < num_devices; i++) {
    cudaDeviceProp prop;
    CUDA_CALL( cudaGetDeviceProperties(&prop, i) );

    // Try printing more detailed information here
    //printf("[GPU %d] %s\n", i, prop.name);
  }

  if (num_devices <= 0) {
    printf("No CUDA device found. Aborting\n");
    exit(1);
  }
  // The per-device arrays are statically sized, so never use more GPUs.
  if (num_devices > MAX_NUM_GPU) num_devices = MAX_NUM_GPU;

  // Split the batch of N images evenly across MPI ranks; the last rank takes
  // the remainder. Assumes mpi_world_size <= MAX_NODES.
  for (int i = 0; i < mpi_world_size; i++) {
    ns[i] = N / mpi_world_size * i;
    ne[i] = N / mpi_world_size * (i + 1);
  }
  ne[mpi_world_size - 1] = N;

  // Split this rank's share of the batch across its GPUs; the last GPU takes
  // the remainder.
  int batch_size = ne[mpi_rank] - ns[mpi_rank];
  for (int i = 0; i < num_devices; i++) {
    Mbegin[i] = (batch_size / num_devices) * i;
    Mend[i] = (batch_size / num_devices) * (i + 1);
  }
  Mend[num_devices - 1] = batch_size;

  im_size = C * H * W;             // elements in one input image
  col_size = C * R * S * OH * OW;  // elements in one im2col buffer
  res_size = K * OH * OW;          // elements in one output image
  filter_size = K * C * R * S;     // elements in the filter tensor

  // Allocate device memory on each GPU.
  for (int i = 0; i < num_devices; i++) {
    CUDA_CALL( cudaSetDevice(i) );
    CUDA_CALL( cudaMalloc(&input_d[i], (Mend[i] - Mbegin[i]) * im_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&input_col_d[i], col_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&filter_d[i], filter_size * sizeof(float)) );
    CUDA_CALL( cudaMalloc(&output_d[i], (Mend[i] - Mbegin[i]) * res_size * sizeof(float)) );
  }
}

void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride) {
}