chundoong-lab-ta/APWS23/sum-reduction/reduction_gpu.cu

#include <cstdio>

#include "reduction.h"

// Number of threads in each block of the reduce_kernel launch; also the
// number of doubles of dynamic shared memory requested per block.
#define THREADS_PER_BLOCK 1024
// Number of input elements consumed by one block: every thread loads up to
// two elements before the in-block reduction starts.
#define ELEMENTS_PER_BLOCK (THREADS_PER_BLOCK * 2)

#ifndef CHECK_CUDA
// Evaluates the CUDA runtime expression `f` exactly once. If the result is
// not cudaSuccess, prints the source location, the numeric error code and
// its description to stderr, then terminates the process with status 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK_CUDA(x);` is a
// single statement and stays correct inside an unbraced if/else. The local
// is named `err_` so it cannot collide with an `err` used in the argument.
#define CHECK_CUDA(f)                                                      \
  do {                                                                     \
    cudaError_t err_ = (f);                                                \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              (int) err_, cudaGetErrorString(err_));                       \
      exit(1);                                                             \
    }                                                                      \
  } while (0)
#endif

// Computes one partial sum per thread block.
//
// Launch layout: 1-D grid of 1-D blocks. Block b sums the 2 * blockDim.x
// consecutive elements of `input` that start at index b * blockDim.x * 2 and
// stores the result in output[b]; indices at or beyond N contribute zero.
// Shared memory: the launch must provide blockDim.x * sizeof(double) bytes of
// dynamic shared memory.
//
//   input  - device array holding at least N doubles (read only)
//   output - device array with one slot per block in the grid
//   N      - number of valid elements in `input`
__global__ void reduce_kernel(const double *input, double *output, size_t N) {
  extern __shared__ double L[];

  const unsigned int tid = threadIdx.x;
  // Index in 64 bits so blockIdx.x * blockDim.x * 2 cannot wrap for large N.
  const size_t block_base = (size_t) blockIdx.x * blockDim.x * 2;
  const size_t first = block_base + tid;
  const size_t second = first + blockDim.x;

  // Each thread folds up to two input elements before the tree reduction.
  double partial = 0;
  if (first < N) partial += input[first];
  if (second < N) partial += input[second];
  L[tid] = partial;
  __syncthreads();

  // Tree reduction over L[0 .. active). Rounding the half up keeps the middle
  // element when `active` is odd, so any block size is reduced correctly; for
  // power-of-two block sizes the pairing is the usual L[tid] += L[tid + half].
  for (unsigned int active = blockDim.x; active > 1;) {
    const unsigned int half = (active + 1) / 2;
    if (tid + half < active) L[tid] += L[tid + half];
    // Reached by every thread of the block on every iteration.
    __syncthreads();
    active = half;
  }

  if (tid == 0) output[blockIdx.x] = L[0];
}

// Host buffer that receives the per-block partial sums; allocated with
// malloc in reduction_gpu_initialize and released in reduction_gpu_finalize.
static double *output_cpu;
// Device buffers allocated with cudaMalloc in reduction_gpu_initialize:
// input_gpu holds a copy of the input array, output_gpu holds one partial
// sum per thread block.
static double *input_gpu, *output_gpu;

// Allocates the buffers used by reduction_gpu(): a device copy of the input
// (num_elements doubles) plus device and host buffers that hold one partial
// sum per thread block. Exits the process with status 1 if any allocation
// fails.
//
//   num_elements - largest input size that will later be reduced
void reduction_gpu_initialize(size_t num_elements) {
  // One partial sum per block; round up so a trailing partial block counts.
  const size_t num_blocks =
      (num_elements + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;
  const size_t partial_bytes = num_blocks * sizeof(double);

  CHECK_CUDA(cudaMalloc(&input_gpu, num_elements * sizeof(double)));
  CHECK_CUDA(cudaMalloc(&output_gpu, partial_bytes));

  output_cpu = (double *) malloc(partial_bytes);
  // malloc(0) is allowed to return NULL, so only a non-empty request that
  // comes back NULL is treated as an allocation failure.
  if (output_cpu == NULL && partial_bytes > 0) {
    fprintf(stderr, "malloc of %zu bytes failed at [%s:%d]\n", partial_bytes,
            __FILE__, __LINE__);
    exit(1);
  }
}

// Returns the sum of the first num_elements doubles of the host array A.
//
// The input is copied into input_gpu, reduce_kernel produces one partial sum
// per block in output_gpu, and the partial sums are copied back and added up
// on the host. The buffers come from reduction_gpu_initialize(), which must
// have been called with a size of at least num_elements. Any CUDA failure
// terminates the process through CHECK_CUDA.
//
//   A            - host array with at least num_elements doubles
//   num_elements - number of elements to sum; 0 yields 0.0
double reduction_gpu(double *A, size_t num_elements) {
  // An empty input would require a zero-block grid, which is not a valid
  // launch configuration; the sum of nothing is 0.
  if (num_elements == 0) return 0.0;

  const size_t output_elements =
      (num_elements + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;

  CHECK_CUDA(cudaMemcpy(input_gpu, A, num_elements * sizeof(double),
                        cudaMemcpyHostToDevice));

  // Named grid/block rather than gridDim/blockDim to avoid confusion with
  // the device built-ins of the same name.
  dim3 grid(output_elements);
  dim3 block(THREADS_PER_BLOCK);
  reduce_kernel<<<grid, block, THREADS_PER_BLOCK * sizeof(double), 0>>>(
      input_gpu, output_gpu, num_elements);
  // A kernel launch returns no status; configuration errors are reported
  // through cudaGetLastError().
  CHECK_CUDA(cudaGetLastError());

  // Blocking copy issued after the kernel on the same (default) stream, so
  // the partial sums are complete once it returns.
  CHECK_CUDA(cudaMemcpy(output_cpu, output_gpu,
                        output_elements * sizeof(double),
                        cudaMemcpyDeviceToHost));

  double sum = 0.0;
  for (size_t i = 0; i < output_elements; i++) { sum += output_cpu[i]; }
  return sum;
}

// Releases the device and host buffers allocated by
// reduction_gpu_initialize(). Each pointer is reset to nullptr afterwards so
// that a repeated call frees nothing instead of double-freeing: both
// cudaFree and free accept a null pointer as a no-op.
void reduction_gpu_finalize() {
  CHECK_CUDA(cudaFree(input_gpu));
  input_gpu = nullptr;
  CHECK_CUDA(cudaFree(output_gpu));
  output_gpu = nullptr;
  free(output_cpu);
  output_cpu = nullptr;
}