chundoong-lab-ta/APWS23/ans/convolution_gpu_ans.cpp

98 lines
4.0 KiB
C++

#include <cstdio>
#include <cstdlib>
#include "convolution.h"
/*
 * CHECK_CUDA(call): evaluate a CUDA runtime expression exactly once and
 * terminate the process if it does not yield cudaSuccess.
 *
 * On failure the source location of the macro invocation and the text from
 * cudaGetErrorString() are written to stderr before exit(EXIT_FAILURE).
 * The do { ... } while (0) wrapper makes the macro behave like a single
 * statement, so it is safe inside an unbraced if/else.
 */
#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    const cudaError_t cuda_rc_ = (call);                                  \
    if (cuda_rc_ == cudaSuccess) break;                                   \
    fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__,       \
            cudaGetErrorString(cuda_rc_));                                \
    exit(EXIT_FAILURE);                                                   \
  } while (0)
/*
 * Direct 2-D convolution kernel.
 *
 * Memory layout (fixed by the indexing below, all buffers densely packed):
 *   I : [N][C][H][W]    input
 *   F : [K][C][R][S]    filters
 *   O : [N][K][OH][OW]  output, where
 *       OH = 1 + (H + 2*pad_h - ((R-1)*dilation_h + 1)) / stride_h
 *       OW = 1 + (W + 2*pad_w - ((S-1)*dilation_w + 1)) / stride_w
 *
 * Thread mapping: the x dimension of the launch selects the batch index and
 * the y dimension selects the output channel. Each thread that falls inside
 * the (N, K) range computes the full OH x OW plane for its (batch, channel)
 * pair; threads outside that range return immediately.
 *
 * Input positions that land in the padding region (outside [0,H) x [0,W))
 * contribute nothing to the sum.
 *
 * Flattened offsets are computed in size_t so that tensors holding more than
 * 2^31 elements do not overflow 32-bit int arithmetic.
 */
__global__ void convolution_kernel(float *I, float *F, float *O, int N, int C,
                                   int H, int W, int K, int R, int S, int pad_h,
                                   int pad_w, int stride_h, int stride_w,
                                   int dilation_h, int dilation_w) {
  const int ON = N;
  const int OC = K;
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  const int on = blockDim.x * blockIdx.x + threadIdx.x;  // batch index
  const int oc = blockDim.y * blockIdx.y + threadIdx.y;  // output channel
  if (on >= ON || oc >= OC) return;  // grid may overshoot N and/or K

  for (int oh = 0; oh < OH; ++oh) {
    for (int ow = 0; ow < OW; ++ow) {
      float sum = 0.0f;
      for (int c = 0; c < C; ++c) {
        for (int r = 0; r < R; ++r) {
          // The input row depends only on (oh, r): reject padded rows once
          // instead of re-testing for every filter column.
          const int h = oh * stride_h - pad_h + r * dilation_h;
          if (h < 0 || h >= H) continue;

          // Offsets of the first element of the current input / filter row.
          const size_t i_row = ((static_cast<size_t>(on) * C + c) * H + h) * W;
          const size_t f_row = ((static_cast<size_t>(oc) * C + c) * R + r) * S;

          for (int s = 0; s < S; ++s) {
            const int w = ow * stride_w - pad_w + s * dilation_w;
            if (w < 0 || w >= W) continue;  // padded column
            sum += I[i_row + w] * F[f_row + s];
          }
        }
      }
      O[((static_cast<size_t>(on) * OC + oc) * OH + oh) * OW + ow] = sum;
    }
  }
}
// Device buffers shared by the host-side entry points in this file:
// convolution_gpu_initialize() allocates them with cudaMalloc,
// convolution_gpu() copies into / out of them, and
// convolution_gpu_finalize() releases them with cudaFree.
float *I_gpu = nullptr;  // input tensor on the device
float *F_gpu = nullptr;  // filter tensor on the device
float *O_gpu = nullptr;  // output tensor on the device
/*
 * Allocates the device buffers (I_gpu, F_gpu, O_gpu) used by
 * convolution_gpu() for a problem of the given shape.
 *
 * Buffer sizes, in floats:
 *   I_gpu : N * C * H * W
 *   F_gpu : K * C * R * S
 *   O_gpu : N * K * OH * OW, with OH / OW derived from the padding, stride
 *           and dilation arguments exactly as in the kernel.
 *
 * Any CUDA failure terminates the process via CHECK_CUDA.
 */
void convolution_gpu_initialize(int N, int C, int H, int W, int K, int R, int S,
                                int pad_h, int pad_w, int stride_h,
                                int stride_w, int dilation_h, int dilation_w) {
  const int ON = N;
  const int OC = K;
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  // Widen to size_t before multiplying: the product of four int dimensions
  // can exceed INT_MAX well before the byte count exceeds device memory.
  const size_t input_elems = static_cast<size_t>(N) * C * H * W;
  const size_t filter_elems = static_cast<size_t>(K) * C * R * S;
  const size_t output_elems = static_cast<size_t>(ON) * OC * OH * OW;

  CHECK_CUDA(cudaMalloc(&I_gpu, input_elems * sizeof(float)));
  CHECK_CUDA(cudaMalloc(&F_gpu, filter_elems * sizeof(float)));
  CHECK_CUDA(cudaMalloc(&O_gpu, output_elems * sizeof(float)));
}
/*
 * Runs the convolution on the GPU.
 *
 * Copies the host tensors I ([N][C][H][W]) and F ([K][C][R][S]) into the
 * device buffers I_gpu / F_gpu, launches convolution_kernel, and copies the
 * result from O_gpu back into the host buffer O ([N][K][OH][OW]).
 *
 * The device buffers must already have been allocated for this shape by
 * convolution_gpu_initialize(). The final cudaMemcpy is a blocking copy, so
 * O is fully written when this function returns. Any CUDA failure terminates
 * the process via CHECK_CUDA.
 */
void convolution_gpu(float *I, float *F, float *O, int N, int C, int H, int W,
                     int K, int R, int S, int pad_h, int pad_w, int stride_h,
                     int stride_w, int dilation_h, int dilation_w) {
  const int ON = N;
  const int OC = K;
  const int OH = 1 + (H + 2 * pad_h - (((R - 1) * dilation_h) + 1)) / stride_h;
  const int OW = 1 + (W + 2 * pad_w - (((S - 1) * dilation_w) + 1)) / stride_w;

  // Nothing to compute for an empty output. Returning here also avoids a
  // kernel launch with a zero-sized grid dimension, which the runtime
  // rejects as an invalid configuration.
  if (ON <= 0 || OC <= 0 || OH <= 0 || OW <= 0) return;

  // Widen to size_t before multiplying so large tensors do not overflow int.
  const size_t input_bytes = static_cast<size_t>(N) * C * H * W * sizeof(float);
  const size_t filter_bytes =
      static_cast<size_t>(K) * C * R * S * sizeof(float);
  const size_t output_bytes =
      static_cast<size_t>(ON) * OC * OH * OW * sizeof(float);

  CHECK_CUDA(cudaMemcpy(I_gpu, I, input_bytes, cudaMemcpyHostToDevice));
  CHECK_CUDA(cudaMemcpy(F_gpu, F, filter_bytes, cudaMemcpyHostToDevice));

  // One thread per (batch, output channel) pair: x covers N, y covers K.
  // Ceil-division so the grid covers sizes that are not multiples of 32.
  const int tile = 32;
  const dim3 block_shape(tile, tile);
  const dim3 grid_shape((N + tile - 1) / tile, (K + tile - 1) / tile);
  convolution_kernel<<<grid_shape, block_shape>>>(
      I_gpu, F_gpu, O_gpu, N, C, H, W, K, R, S, pad_h, pad_w, stride_h,
      stride_w, dilation_h, dilation_w);
  CHECK_CUDA(cudaGetLastError());  // reports launch-configuration errors

  CHECK_CUDA(cudaMemcpy(O, O_gpu, output_bytes, cudaMemcpyDeviceToHost));
}
/*
 * Releases the device buffers allocated by convolution_gpu_initialize().
 *
 * The shape arguments are not used; they are kept so the signature matches
 * the other entry points. Each pointer is reset to nullptr after it is
 * freed, so calling this function again (or before any allocation) is
 * harmless: cudaFree on a null pointer performs no operation.
 * Any CUDA failure terminates the process via CHECK_CUDA.
 */
void convolution_gpu_finalize(int N, int C, int H, int W, int K, int R, int S,
                              int pad_h, int pad_w, int stride_h, int stride_w,
                              int dilation_h, int dilation_w) {
  CHECK_CUDA(cudaFree(I_gpu));
  I_gpu = nullptr;
  CHECK_CUDA(cudaFree(F_gpu));
  F_gpu = nullptr;
  CHECK_CUDA(cudaFree(O_gpu));
  O_gpu = nullptr;
}