// reduction.cu — GPU sum reduction: device kernel plus host-side
// initialize / reduce / finalize wrappers. Public interface in reduction.h.
#include <cstdio>
|
|
|
|
#include "reduction.h"
|
|
|
|
#define THREADS_PER_BLOCK 1024
|
|
#define ELEMENTS_PER_BLOCK (THREADS_PER_BLOCK * 2)
|
|
|
|
#ifndef CHECK_CUDA
// Abort with a diagnostic if a CUDA runtime call fails.
// The body is wrapped in do { ... } while (0) so the macro expands to a
// single statement and is safe in un-braced if/else contexts; a bare
// { ... } block followed by the caller's ';' would break `else` chains.
#define CHECK_CUDA(f) \
  do { \
    cudaError_t err = (f); \
    if (err != cudaSuccess) { \
      fprintf(stderr, "CUDA error at [%s:%d] %d %s\n", __FILE__, __LINE__, \
              err, cudaGetErrorString(err)); \
      exit(1); \
    } \
  } while (0)
#endif
// Block-level sum reduction.
// Each block consumes ELEMENTS_PER_BLOCK (= 2 * blockDim.x) consecutive
// elements of `input`: every thread first sums up to two elements during
// the load, then the block tree-reduces the partials in shared memory and
// thread 0 writes the block's sum to output[blockIdx.x].
//
// Launch contract:
//   - gridDim.x  == ceil(N / (2 * blockDim.x))
//   - dynamic shared memory == blockDim.x * sizeof(double)
//   - blockDim.x must be a power of two (the halving loop assumes it)
__global__ void reduce_kernel(const double *__restrict__ input,
                              double *__restrict__ output, int N) {
  extern __shared__ double L[];  // one running partial per thread

  unsigned int tid = threadIdx.x;
  unsigned int offset = blockIdx.x * blockDim.x * 2;  // block's first element

  // First reduction step happens during the load; both reads are
  // bounds-checked so the tail block handles N not divisible by
  // ELEMENTS_PER_BLOCK.
  L[tid] = 0;
  if (tid + offset < N) L[tid] += input[tid + offset];
  if (tid + blockDim.x + offset < N) L[tid] += input[tid + blockDim.x + offset];
  __syncthreads();

  // Shared-memory tree reduction. The barrier sits outside the divergent
  // branch so every thread in the block reaches it.
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2) {
    if (tid < stride) L[tid] += L[tid + stride];
    __syncthreads();
  }

  // Publish this block's partial sum.
  if (tid == 0) output[blockIdx.x] = L[0];
}
|
// Host staging buffer for the per-block partial sums copied back from the GPU.
static double *output_cpu;

// Device buffers: input_gpu holds the full array, output_gpu one partial
// sum per block. Allocated in reduction_gpu_initialize(), released in
// reduction_gpu_finalize().
static double *input_gpu, *output_gpu;
// Allocates the device input/output buffers and the host staging buffer
// sized for `num_elements` inputs. Must be called before reduction_gpu()
// and paired with reduction_gpu_finalize().
void reduction_gpu_initialize(size_t num_elements) {
  // One partial sum per block; each block consumes ELEMENTS_PER_BLOCK
  // inputs (ceil-div, computed once instead of repeating the expression).
  size_t output_elements =
      (num_elements + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;

  CHECK_CUDA(cudaMalloc(&input_gpu, num_elements * sizeof(double)));
  CHECK_CUDA(cudaMalloc(&output_gpu, output_elements * sizeof(double)));

  output_cpu = (double *) malloc(output_elements * sizeof(double));
  // malloc failure was previously unchecked; a NULL here would crash later
  // inside the D2H copy instead of failing with a clear message.
  if (output_cpu == NULL) {
    fprintf(stderr, "malloc failed at [%s:%d]\n", __FILE__, __LINE__);
    exit(1);
  }
}
// Sums `num_elements` doubles from host array A on the GPU.
// Copies A to the device, runs one reduction-kernel pass that produces one
// partial sum per block, copies the partials back, and finishes the sum on
// the host. reduction_gpu_initialize() must have been called with a size
// >= num_elements. Returns the total sum.
double reduction_gpu(double *A, size_t num_elements) {
  size_t output_elements =
      (num_elements + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK;

  // Previously unchecked: a failed H2D copy would silently reduce garbage.
  CHECK_CUDA(cudaMemcpy(input_gpu, A, num_elements * sizeof(double),
                        cudaMemcpyHostToDevice));

  dim3 gridDim(output_elements);
  dim3 blockDim(THREADS_PER_BLOCK);
  reduce_kernel<<<gridDim, blockDim, THREADS_PER_BLOCK * sizeof(double), 0>>>(
      input_gpu, output_gpu, num_elements);
  // Kernel launches do not return an error status; surface bad launch
  // configurations here instead of failing mysteriously later.
  CHECK_CUDA(cudaGetLastError());

  // This blocking D2H copy also synchronizes with the kernel above.
  CHECK_CUDA(cudaMemcpy(output_cpu, output_gpu,
                        output_elements * sizeof(double),
                        cudaMemcpyDeviceToHost));

  // Final accumulation of the per-block partials on the host.
  double sum = 0.0;
  for (size_t i = 0; i < output_elements; i++) { sum += output_cpu[i]; }
  return sum;
}
// Releases the device and host buffers created by reduction_gpu_initialize().
// Pointers are reset to NULL so an accidental second finalize (or a stale
// use after teardown) cannot double-free or touch freed memory.
void reduction_gpu_finalize() {
  CHECK_CUDA(cudaFree(input_gpu));
  CHECK_CUDA(cudaFree(output_gpu));
  free(output_cpu);
  input_gpu = NULL;
  output_gpu = NULL;
  output_cpu = NULL;
}