chundoong-lab-ta/APWS23/ans/image_rotation_ans.cu

66 lines
2.4 KiB
Plaintext

#include <cstdio>
#include "image_rotation.h"
// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the process with EXIT_FAILURE.
// The argument is parenthesized so that any expression can be passed safely,
// and the do/while(0) wrapper makes the macro behave as a single statement.
#define CHECK_CUDA(call)                                                \
  do {                                                                  \
    cudaError_t status_ = (call);                                       \
    if (status_ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(status_));                             \
      exit(EXIT_FAILURE);                                               \
    }                                                                   \
  } while (0)
// Rotates a W x H row-major float image about the point (W/2, H/2) using
// inverse mapping: each thread owns one destination pixel, computes the
// source pixel it comes from, and copies that value (or writes 0.0f when the
// source position lies outside the image).
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles destination
// pixel (dest_x, dest_y). Threads that fall outside the image exit early, so
// the grid may overshoot W and H.
//
//   input      source image, W * H floats, read only
//   output     destination image, W * H floats, every in-range pixel written
//   sin_theta  sine of the rotation angle
//   cos_theta  cosine of the rotation angle
__global__ void rotate_image_kernel(float *input, float *output, int W, int H,
                                    float sin_theta, float cos_theta) {
  const int dest_x = blockDim.x * blockIdx.x + threadIdx.x;
  const int dest_y = blockDim.y * blockIdx.y + threadIdx.y;
  if (dest_x >= W || dest_y >= H) return;

  // Rotation centre.
  const float x0 = W / 2.0f;
  const float y0 = H / 2.0f;

  // Destination position relative to the centre.
  const float xOff = dest_x - x0;
  const float yOff = dest_y - y0;

  // NOTE: the (int) conversion truncates toward zero, so source coordinates
  // in the open interval (-1, 0) map to index 0 rather than being rejected.
  const int src_x = (int)(xOff * cos_theta + yOff * sin_theta + x0);
  const int src_y = (int)(yOff * cos_theta - xOff * sin_theta + y0);

  const bool inside =
      (src_x >= 0) && (src_x < W) && (src_y >= 0) && (src_y < H);

  // Flat indices are formed in size_t so that images with more than INT_MAX
  // pixels do not overflow 32-bit int arithmetic.
  float value = 0.0f;
  if (inside) {
    value = input[(size_t)src_y * W + src_x];
  }
  output[(size_t)dest_y * W + dest_x] = value;
}
// File-scope device buffers: allocated with cudaMalloc in
// rotate_image_gpu_initialize, used as the kernel's source (input_gpu) and
// destination (output_gpu) in rotate_image_gpu, and released with cudaFree
// in rotate_image_gpu_finalize.
float *input_gpu, *output_gpu;
// Allocates the two device buffers (input_gpu and output_gpu), each large
// enough to hold image_width * image_height floats.
// Exits the process via CHECK_CUDA if either allocation fails.
void rotate_image_gpu_initialize(int image_width, int image_height) {
  // Widen before multiplying: int * int would overflow for images with more
  // than INT_MAX pixels before the multiplication by sizeof(float).
  const size_t image_bytes =
      static_cast<size_t>(image_width) * image_height * sizeof(float);

  CHECK_CUDA(cudaMalloc(&input_gpu, image_bytes));
  CHECK_CUDA(cudaMalloc(&output_gpu, image_bytes));
}
// Rotates a host image on the GPU.
//
// Copies input_image to the device buffer input_gpu, launches
// rotate_image_kernel with one thread per destination pixel, and copies the
// result from output_gpu back into output_image. Both host buffers must hold
// image_width * image_height floats, and the device buffers must already
// have been allocated (see rotate_image_gpu_initialize).
// Any CUDA failure terminates the process via CHECK_CUDA.
void rotate_image_gpu(float *input_image, float *output_image, int image_width,
                      int image_height, float sin_theta, float cos_theta) {
  // An empty image has nothing to rotate; a grid dimension of 0 would also
  // be rejected as an invalid launch configuration.
  if (image_width == 0 || image_height == 0) return;

  // Widen before multiplying so large images do not overflow int.
  const size_t image_bytes =
      static_cast<size_t>(image_width) * image_height * sizeof(float);

  CHECK_CUDA(cudaMemcpy(input_gpu, input_image, image_bytes,
                        cudaMemcpyHostToDevice));

  // 32 x 32 threads per block; the grid is rounded up so it covers the
  // whole image (the kernel discards out-of-range threads).
  const int kTile = 32;
  const dim3 block(kTile, kTile);
  const dim3 grid((image_width + kTile - 1) / kTile,
                  (image_height + kTile - 1) / kTile);

  rotate_image_kernel<<<grid, block>>>(input_gpu, output_gpu, image_width,
                                       image_height, sin_theta, cos_theta);
  // Kernel launches do not return a status; fetch launch errors explicitly.
  CHECK_CUDA(cudaGetLastError());

  CHECK_CUDA(cudaMemcpy(output_image, output_gpu, image_bytes,
                        cudaMemcpyDeviceToHost));
}
// Releases the device buffers input_gpu and output_gpu.
// Each pointer is reset to null after it is freed so that no dangling device
// pointer is left behind; cudaFree performs no operation on a null pointer,
// which makes a repeated finalize call harmless.
// Exits the process via CHECK_CUDA if cudaFree reports an error.
void rotate_image_gpu_finalize() {
  CHECK_CUDA(cudaFree(input_gpu));
  input_gpu = nullptr;

  CHECK_CUDA(cudaFree(output_gpu));
  output_gpu = nullptr;
}