153 lines
5.6 KiB
C++
153 lines
5.6 KiB
C++
#include <cudnn.h>
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
|
|
#include "convolution.h"
|
|
|
|
// Evaluates a CUDA runtime call exactly once. If the returned status is not
// cudaSuccess, prints the source file, line and cudaGetErrorString() text to
// stderr and terminates the process with EXIT_FAILURE.
// The argument is parenthesized so any expression can be passed safely, and
// the do/while(0) wrapper makes the macro behave as a single statement.
#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(status_));                               \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
|
|
|
|
// Evaluates a cuDNN call exactly once. If the returned status is not
// CUDNN_STATUS_SUCCESS, prints the source file, line and
// cudnnGetErrorString() text to stderr and terminates the process with
// EXIT_FAILURE.
// The argument is parenthesized so any expression can be passed safely, and
// the do/while(0) wrapper makes the macro behave as a single statement.
#define CHECK_CUDNN(call)                                                 \
  do {                                                                    \
    cudnnStatus_t status_ = (call);                                       \
    if (status_ != CUDNN_STATUS_SUCCESS) {                                \
      fprintf(stderr, "CUDNN error (%s:%d): %s\n", __FILE__, __LINE__,    \
              cudnnGetErrorString(status_));                              \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)
|
|
|
|
static cudnnHandle_t handle;
|
|
static cudnnTensorDescriptor_t input_desc;
|
|
static cudnnFilterDescriptor_t filter_desc;
|
|
static cudnnConvolutionDescriptor_t conv_desc;
|
|
static cudnnTensorDescriptor_t output_desc;
|
|
static int ON, OC, OH, OW;
|
|
static float *I_gpu, *F_gpu, *O_gpu, *workspace;
|
|
static cudnnConvolutionFwdAlgoPerf_t best_algo;
|
|
|
|
static const char *algo_to_string(cudnnConvolutionFwdAlgo_t algo);
|
|
|
|
void convolution_cudnn_initialize(int N, int C, int H, int W, int K, int R,
|
|
int S, int pad_h, int pad_w, int stride_h,
|
|
int stride_w, int dilation_h,
|
|
int dilation_w) {
|
|
// TODO: Implement here
|
|
|
|
// 1. Setup handle
|
|
CHECK_CUDNN(cudnnCreate(&handle));
|
|
|
|
// 2. Setup input_desc (NCHW, float)
|
|
// cudnnCreateTensorDescriptor(...);
|
|
// cudnnSetTensor4dDescriptor(input_desc, ...);
|
|
|
|
// 3. Setup filter_desc (NCHW, float)
|
|
// cudnnCreateFilterDescriptor(...);
|
|
// cudnnSetFilter4dDescriptor(filter_desc, ...);
|
|
|
|
// 4. Setup conv_desc
|
|
// cudnnCreateConvolutionDescriptor(...);
|
|
// cudnnSetConvolution2dDescriptor(conv_desc, ...);
|
|
|
|
// 5. Find output tensor dimensions
|
|
// cudnnGetConvolution2dForwardOutputDim(..., &ON, &OC, &OH, &OW);
|
|
|
|
// 6. Setup output_desc (NCHW, float)
|
|
// cudnnCreateTensorDescriptor(...);
|
|
// cudnnSetTensor4dDescriptor(output_desc, ...);
|
|
|
|
// 7. Find the number of algorithm available
|
|
int max_algo_count;
|
|
// cudnnGetConvolutionForwardAlgorithmMaxCount(handle, ...);
|
|
|
|
// 8. Find the best algorithm
|
|
int returned_algo_count;
|
|
cudnnConvolutionFwdAlgoPerf_t algo_perfs[max_algo_count];
|
|
// cudnnFindConvolutionForwardAlgorithm(handle, ...);
|
|
|
|
// 9. Print algorithms for debugging
|
|
for (int i = 0; i < returned_algo_count; ++i) {
|
|
printf("Algorithm %d: name %s, time %f sec, memory %lu byte, status %s\n",
|
|
i, algo_to_string(algo_perfs[i].algo), algo_perfs[i].time,
|
|
algo_perfs[i].memory, cudnnGetErrorString(algo_perfs[i].status));
|
|
}
|
|
|
|
// 10. Save first algorithm
|
|
best_algo = algo_perfs[0];
|
|
|
|
// 11. Allocate GPU memory for input, filter, output, workspace
|
|
// cudaMalloc(&I_gpu, ...);
|
|
// cudaMalloc(&F_gpu, ...);
|
|
// cudaMalloc(&O_gpu, ...);
|
|
// cudaMalloc(&workspace, ...);
|
|
}
|
|
|
|
void convolution_cudnn(float *I, float *F, float *O, int N, int C, int H, int W,
|
|
int K, int R, int S, int pad_h, int pad_w, int stride_h,
|
|
int stride_w, int dilation_h, int dilation_w) {
|
|
// TODO: Implement here
|
|
|
|
// 1. Send input from CPU to GPU
|
|
// cudaMemcpy(I_gpu, I, ...);
|
|
|
|
// 2. Send filter from CPU to GPU
|
|
// cudaMemcpy(F_gpu, F, ...);
|
|
|
|
// 3. Run convolution
|
|
const float alpha = 1, beta = 0;
|
|
// cudnnConvolutionForward(handle, ...);
|
|
|
|
// 4. Send output from GPU to CPU
|
|
// cudaMemcpy(O, O_gpu, ...);
|
|
}
|
|
|
|
void convolution_cudnn_finalize(int N, int C, int H, int W, int K, int R, int S,
|
|
int pad_h, int pad_w, int stride_h,
|
|
int stride_w, int dilation_h, int dilation_w) {
|
|
// TODO: Implement here
|
|
|
|
// 1. Free GPU memory for input, filter, output, workspace
|
|
// cudaFree(...);
|
|
// cudaFree(...);
|
|
// cudaFree(...);
|
|
// cudaFree(...);
|
|
|
|
// 2. Destroy descriptors and handle
|
|
// cudnnDestroyTensorDescriptor(...);
|
|
// cudnnDestroyFilterDescriptor(...));
|
|
// cudnnDestroyConvolutionDescriptor(...);
|
|
// cudnnDestroyTensorDescriptor(...);
|
|
// cudnnDestroy(...);
|
|
}
|
|
|
|
// Returns the enumerator name of a cuDNN forward-convolution algorithm as a
// string literal. Values without an entry in the table yield
// "<unknown algorithm>".
const char *algo_to_string(cudnnConvolutionFwdAlgo_t algo) {
  // Each entry pairs an enumerator with its own spelling.
  static const struct {
    cudnnConvolutionFwdAlgo_t id;
    const char *label;
  } kAlgoNames[] = {
      {CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
       "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM"},
      {CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
       "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM"},
      {CUDNN_CONVOLUTION_FWD_ALGO_GEMM, "CUDNN_CONVOLUTION_FWD_ALGO_GEMM"},
      {CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, "CUDNN_CONVOLUTION_FWD_ALGO_DIRECT"},
      {CUDNN_CONVOLUTION_FWD_ALGO_FFT, "CUDNN_CONVOLUTION_FWD_ALGO_FFT"},
      {CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
       "CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING"},
      {CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
       "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD"},
      {CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
       "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED"},
      {CUDNN_CONVOLUTION_FWD_ALGO_COUNT, "CUDNN_CONVOLUTION_FWD_ALGO_COUNT"},
  };

  const unsigned entry_count = sizeof(kAlgoNames) / sizeof(kAlgoNames[0]);
  for (unsigned idx = 0; idx < entry_count; ++idx) {
    if (kAlgoNames[idx].id == algo) {
      return kAlgoNames[idx].label;
    }
  }
  return "<unknown algorithm>";
}