chundoong-lab-ta/APWS23/ans/integral_gpu_ans.cu

#include <cstdio>
#include "integral.h"
// Each block runs THREADS_PER_BLOCK threads, and every thread accumulates two
// intervals, so one block covers ELEMENTS_PER_BLOCK intervals of the sum.
#define THREADS_PER_BLOCK 1024
#define ELEMENTS_PER_BLOCK (THREADS_PER_BLOCK * 2)
#ifndef CHECK_CUDA
#define CHECK_CUDA(f) \
  { \
    cudaError_t err = (f); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              err, cudaGetErrorString(err)); \
      exit(1); \
    } \
  }
#endif
// Integrand: the integral of 4 / (1 + x^2) over [0, 1] equals 4 * arctan(1) = pi.
static __device__ double f(double x) { return 4.0 / (1 + x * x); }

// Each block computes the partial Riemann sum over ELEMENTS_PER_BLOCK intervals
// and writes it to output[blockIdx.x], using a tree reduction in shared memory.
__global__ void integral_kernel(double *output, size_t N) {
  extern __shared__ double L[];
  unsigned int tid = threadIdx.x;
  unsigned int offset = blockIdx.x * blockDim.x * 2;
  unsigned int stride = blockDim.x;
  double dx = 1.0 / (double) N;

  // Each thread accumulates up to two intervals into its shared-memory slot.
  L[tid] = 0;
  unsigned int x1 = tid + offset;
  unsigned int x2 = tid + stride + offset;
  if (x1 < N) L[tid] += f(x1 * dx) * dx;
  if (x2 < N) L[tid] += f(x2 * dx) * dx;
  __syncthreads();

  // Pairwise tree reduction; after the loop, L[0] holds the block's sum.
  for (stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (tid < stride) L[tid] += L[tid + stride];
    __syncthreads();
  }
  if (tid == 0) output[blockIdx.x] = L[0];
}
// Per-block partial sums: output_gpu lives on the device, output_cpu is the
// host-side staging buffer for the final accumulation.
static double *output_cpu;
static double *output_gpu;

void integral_gpu_initialize(size_t num_intervals) {
  // One partial-sum slot per block (ceiling division by ELEMENTS_PER_BLOCK).
  CHECK_CUDA(cudaMalloc(&output_gpu, (num_intervals + ELEMENTS_PER_BLOCK - 1) /
                                         ELEMENTS_PER_BLOCK * sizeof(double)));
  output_cpu = (double *) malloc((num_intervals + ELEMENTS_PER_BLOCK - 1) /
                                 ELEMENTS_PER_BLOCK * sizeof(double));
}
double integral_gpu(size_t num_intervals) {
  size_t output_elements =
      (num_intervals + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;
  dim3 gridDim(output_elements);
  dim3 blockDim(THREADS_PER_BLOCK);

  // One shared-memory double per thread for the in-block reduction.
  integral_kernel<<<gridDim, blockDim, THREADS_PER_BLOCK * sizeof(double), 0>>>(
      output_gpu, num_intervals);

  // cudaMemcpy on the default stream also synchronizes with the kernel above.
  CHECK_CUDA(cudaMemcpy(output_cpu, output_gpu,
                        output_elements * sizeof(double),
                        cudaMemcpyDeviceToHost));

  // Accumulate the per-block partial sums on the host.
  double sum = 0.0;
  for (size_t i = 0; i < output_elements; i++) { sum += output_cpu[i]; }
  return sum;
}
void integral_gpu_finalize() {
  CHECK_CUDA(cudaFree(output_gpu));
  free(output_cpu);
}