chundoong-lab-ta/APWS23/ans/integral_gpu_ans.cu

#include "integral.h"
#include <cstdio>
#include <cstdlib>  // exit, malloc, free
#define THREADS_PER_BLOCK 1024
#define ELEMENTS_PER_BLOCK (THREADS_PER_BLOCK * 2)
// Abort with the failing call's file/line, error code, and message whenever a
// CUDA runtime call does not return cudaSuccess.
#ifndef CHECK_CUDA
#define CHECK_CUDA(f)                                                        \
  {                                                                          \
    cudaError_t err = (f);                                                   \
    if (err != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__,   \
              err, cudaGetErrorString(err));                                 \
      exit(1);                                                               \
    }                                                                        \
  }
#endif
// Integrand: the integral of 4 / (1 + x^2) over [0, 1] equals 4 * atan(1) = pi,
// so the Riemann sum below approximates pi.
static __device__ double f(double x) { return 4.0 / (1 + x * x); }
// One block reduces ELEMENTS_PER_BLOCK intervals: each thread accumulates two
// rectangle areas into shared memory, a tree reduction combines them, and
// thread 0 writes the block's partial sum to output[blockIdx.x].
__global__ void integral_kernel(double *output, size_t N) {
  extern __shared__ double L[];
  unsigned int tid = threadIdx.x;
  unsigned int offset = blockIdx.x * blockDim.x * 2;
  unsigned int stride = blockDim.x;
  double dx = 1.0 / (double) N;

  // Each thread handles intervals x1 and x2 (left-endpoint rule), guarding the
  // tail block where indices may run past N.
  L[tid] = 0;
  unsigned int x1 = tid + offset;
  unsigned int x2 = tid + stride + offset;
  if (x1 < N) L[tid] += f(x1 * dx) * dx;
  if (x2 < N) L[tid] += f(x2 * dx) * dx;
  __syncthreads();

  // Shared-memory tree reduction: halve the number of active threads each step.
  for (stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (tid < stride) L[tid] += L[tid + stride];
    __syncthreads();
  }

  if (tid == 0) output[blockIdx.x] = L[0];
}
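
#ifdef INTEGRAL_GPU_CPU_REFERENCE
// A minimal host-side reference, guarded behind a hypothetical macro that is
// not part of the original lab build: the same left-endpoint Riemann sum
// computed sequentially, useful for spot-checking the GPU result for small
// interval counts.
static double integral_cpu_reference(size_t N) {
  double dx = 1.0 / (double) N;
  double sum = 0.0;
  for (size_t i = 0; i < N; i++) {
    double x = i * dx;
    sum += 4.0 / (1 + x * x) * dx;  // same integrand as the __device__ f above
  }
  return sum;
}
#endif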
// Per-block partial sums: ceil(num_intervals / ELEMENTS_PER_BLOCK) doubles on
// both the device (reduction output) and the host (final summation).
static double *output_cpu;
static double *output_gpu;

void integral_gpu_initialize(size_t num_intervals) {
  size_t output_elements =
      (num_intervals + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;
  CHECK_CUDA(cudaMalloc(&output_gpu, output_elements * sizeof(double)));
  output_cpu = (double *) malloc(output_elements * sizeof(double));
}
// Launch the reduction kernel, then finish the sum of the per-block results on
// the host.
double integral_gpu(size_t num_intervals) {
  size_t output_elements =
      (num_intervals + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;
  dim3 gridDim(output_elements);
  dim3 blockDim(THREADS_PER_BLOCK);

  // Dynamic shared memory: one double per thread for the block-level reduction.
  integral_kernel<<<gridDim, blockDim, THREADS_PER_BLOCK * sizeof(double), 0>>>(
      output_gpu, num_intervals);

  // The copy runs on the default stream after the kernel, so it also serves as
  // the synchronization point before the host reads the partial sums.
  CHECK_CUDA(cudaMemcpy(output_cpu, output_gpu, output_elements * sizeof(double),
                        cudaMemcpyDeviceToHost));

  double sum = 0.0;
  for (size_t i = 0; i < output_elements; i++) { sum += output_cpu[i]; }
  return sum;
}
void integral_gpu_finalize() {
  CHECK_CUDA(cudaFree(output_gpu));
  free(output_cpu);
}