#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cudnn.h>

#include "convolution.h"

#define CHECK_CUDA(call)                                              \
  do {                                                                \
    cudaError_t status_ = call;                                       \
    if (status_ != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, \
              cudaGetErrorString(status_));                           \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (0)

#define CHECK_CUDNN(call)                                              \
  do {                                                                 \
    cudnnStatus_t status_ = call;                                      \
    if (status_ != CUDNN_STATUS_SUCCESS) {                             \
      fprintf(stderr, "CUDNN error (%s:%d): %s\n", __FILE__, __LINE__, \
              cudnnGetErrorString(status_));                           \
      exit(EXIT_FAILURE);                                              \
    }                                                                  \
  } while (0)

static cudnnHandle_t handle;

static cudnnTensorDescriptor_t input_desc;
static cudnnFilterDescriptor_t filter_desc;
static cudnnConvolutionDescriptor_t conv_desc;
static cudnnTensorDescriptor_t output_desc;

static int ON, OC, OH, OW;

static float *I_gpu, *F_gpu, *O_gpu, *workspace;

static cudnnConvolutionFwdAlgoPerf_t best_algo;

static const char *algo_to_string(cudnnConvolutionFwdAlgo_t algo);

void convolution_cudnn_initialize(int N, int C, int H, int W, int K, int R,
                                  int S, int pad_h, int pad_w, int stride_h,
                                  int stride_w, int dilation_h,
                                  int dilation_w) {
  // TODO: Implement here

  // 1. Setup handle
  CHECK_CUDNN(cudnnCreate(&handle));

  // 2. Setup input_desc (NCHW, float)
  // cudnnCreateTensorDescriptor(...);
  // cudnnSetTensor4dDescriptor(input_desc, ...);

  // 3. Setup filter_desc (NCHW, float)
  // cudnnCreateFilterDescriptor(...);
  // cudnnSetFilter4dDescriptor(filter_desc, ...);

  // 4. Setup conv_desc
  // cudnnCreateConvolutionDescriptor(...);
  // cudnnSetConvolution2dDescriptor(conv_desc, ...);

  // 5. Find output tensor dimensions
  // cudnnGetConvolution2dForwardOutputDim(..., &ON, &OC, &OH, &OW);

  // 6. Setup output_desc (NCHW, float)
  // cudnnCreateTensorDescriptor(...);
  // cudnnSetTensor4dDescriptor(output_desc, ...);

  // 7. Find the number of available algorithms
  int max_algo_count;
  // cudnnGetConvolutionForwardAlgorithmMaxCount(handle, ...);

  // 8. Find the best algorithm
  int returned_algo_count;
  cudnnConvolutionFwdAlgoPerf_t algo_perfs[max_algo_count];
  // cudnnFindConvolutionForwardAlgorithm(handle, ...);

  // 9. Print algorithms for debugging
  for (int i = 0; i < returned_algo_count; ++i) {
    printf("Algorithm %d: name %s, time %f ms, memory %zu bytes, status %s\n",
           i, algo_to_string(algo_perfs[i].algo), algo_perfs[i].time,
           algo_perfs[i].memory, cudnnGetErrorString(algo_perfs[i].status));
  }

  // 10. Save the first (fastest) algorithm
  best_algo = algo_perfs[0];

  // 11. Allocate GPU memory for input, filter, output, workspace
  // cudaMalloc(&I_gpu, ...);
  // cudaMalloc(&F_gpu, ...);
  // cudaMalloc(&O_gpu, ...);
  // cudaMalloc(&workspace, ...);
}

void convolution_cudnn(float *I, float *F, float *O, int N, int C, int H,
                       int W, int K, int R, int S, int pad_h, int pad_w,
                       int stride_h, int stride_w, int dilation_h,
                       int dilation_w) {
  // TODO: Implement here

  // 1. Send input from CPU to GPU
  // cudaMemcpy(I_gpu, I, ...);

  // 2. Send filter from CPU to GPU
  // cudaMemcpy(F_gpu, F, ...);

  // 3. Run convolution
  const float alpha = 1, beta = 0;
  // cudnnConvolutionForward(handle, ...);

  // 4. Send output from GPU to CPU
  // cudaMemcpy(O, O_gpu, ...);
}
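/*
 * Reference sketch (kept out of the build with #if 0 so the TODO skeleton
 * above remains the deliverable): one possible way to fill in the steps of
 * convolution_cudnn_initialize() and convolution_cudnn(). It assumes NCHW
 * layout with float data (as the step comments suggest),
 * CUDNN_CROSS_CORRELATION mode, and that the workspace size reported for the
 * chosen algorithm (best_algo.memory) is what cudnnConvolutionForward()
 * needs. Adapt as required.
 */
#if 0
void convolution_cudnn_initialize(int N, int C, int H, int W, int K, int R,
                                  int S, int pad_h, int pad_w, int stride_h,
                                  int stride_w, int dilation_h,
                                  int dilation_w) {
  // 1. Setup handle
  CHECK_CUDNN(cudnnCreate(&handle));

  // 2. Input tensor descriptor: N x C x H x W, NCHW, float
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc));
  CHECK_CUDNN(cudnnSetTensor4dDescriptor(input_desc, CUDNN_TENSOR_NCHW,
                                         CUDNN_DATA_FLOAT, N, C, H, W));

  // 3. Filter descriptor: K x C x R x S, NCHW, float
  CHECK_CUDNN(cudnnCreateFilterDescriptor(&filter_desc));
  CHECK_CUDNN(cudnnSetFilter4dDescriptor(filter_desc, CUDNN_DATA_FLOAT,
                                         CUDNN_TENSOR_NCHW, K, C, R, S));

  // 4. Convolution descriptor: padding, stride, dilation
  CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc));
  CHECK_CUDNN(cudnnSetConvolution2dDescriptor(
      conv_desc, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
      CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

  // 5. Output dimensions implied by input, filter, and convolution settings
  CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
      conv_desc, input_desc, filter_desc, &ON, &OC, &OH, &OW));

  // 6. Output tensor descriptor: ON x OC x OH x OW, NCHW, float
  CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc));
  CHECK_CUDNN(cudnnSetTensor4dDescriptor(output_desc, CUDNN_TENSOR_NCHW,
                                         CUDNN_DATA_FLOAT, ON, OC, OH, OW));

  // 7.-8. Benchmark the available forward algorithms
  //       (the debug print loop from the skeleton above is omitted here)
  int max_algo_count;
  CHECK_CUDNN(
      cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &max_algo_count));
  int returned_algo_count;
  cudnnConvolutionFwdAlgoPerf_t algo_perfs[max_algo_count];
  CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
      handle, input_desc, filter_desc, conv_desc, output_desc, max_algo_count,
      &returned_algo_count, algo_perfs));

  // 10. Results are sorted by time, so the first entry is the fastest
  best_algo = algo_perfs[0];

  // 11. Device buffers for input, filter, output, and algorithm workspace
  CHECK_CUDA(cudaMalloc((void **)&I_gpu, sizeof(float) * N * C * H * W));
  CHECK_CUDA(cudaMalloc((void **)&F_gpu, sizeof(float) * K * C * R * S));
  CHECK_CUDA(cudaMalloc((void **)&O_gpu, sizeof(float) * ON * OC * OH * OW));
  CHECK_CUDA(cudaMalloc((void **)&workspace, best_algo.memory));
}

void convolution_cudnn(float *I, float *F, float *O, int N, int C, int H,
                       int W, int K, int R, int S, int pad_h, int pad_w,
                       int stride_h, int stride_w, int dilation_h,
                       int dilation_w) {
  // 1.-2. Copy input and filter to the device
  CHECK_CUDA(cudaMemcpy(I_gpu, I, sizeof(float) * N * C * H * W,
                        cudaMemcpyHostToDevice));
  CHECK_CUDA(cudaMemcpy(F_gpu, F, sizeof(float) * K * C * R * S,
                        cudaMemcpyHostToDevice));

  // 3. O = 1 * conv(I, F) + 0 * O, using the algorithm chosen at init time
  const float alpha = 1, beta = 0;
  CHECK_CUDNN(cudnnConvolutionForward(
      handle, &alpha, input_desc, I_gpu, filter_desc, F_gpu, conv_desc,
      best_algo.algo, workspace, best_algo.memory, &beta, output_desc, O_gpu));

  // 4. Copy the result back to the host
  CHECK_CUDA(cudaMemcpy(O, O_gpu, sizeof(float) * ON * OC * OH * OW,
                        cudaMemcpyDeviceToHost));
}
#endif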
void convolution_cudnn_finalize(int N, int C, int H, int W, int K, int R,
                                int S, int pad_h, int pad_w, int stride_h,
                                int stride_w, int dilation_h,
                                int dilation_w) {
  // TODO: Implement here

  // 1. Free GPU memory for input, filter, output, workspace
  // cudaFree(...);
  // cudaFree(...);
  // cudaFree(...);
  // cudaFree(...);

  // 2. Destroy descriptors and handle
  // cudnnDestroyTensorDescriptor(...);
  // cudnnDestroyFilterDescriptor(...);
  // cudnnDestroyConvolutionDescriptor(...);
  // cudnnDestroyTensorDescriptor(...);
  // cudnnDestroy(...);
}

static const char *algo_to_string(cudnnConvolutionFwdAlgo_t algo) {
  switch (algo) {
    case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
      return "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM";
    case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
      return "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM";
    case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
      return "CUDNN_CONVOLUTION_FWD_ALGO_GEMM";
    case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
      return "CUDNN_CONVOLUTION_FWD_ALGO_DIRECT";
    case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
      return "CUDNN_CONVOLUTION_FWD_ALGO_FFT";
    case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
      return "CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING";
    case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
      return "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD";
    case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED:
      return "CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED";
    case CUDNN_CONVOLUTION_FWD_ALGO_COUNT:
      return "CUDNN_CONVOLUTION_FWD_ALGO_COUNT";
    default:
      return "";
  }
}
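/*
 * Reference sketch for convolution_cudnn_finalize(), likewise kept out of
 * the build with #if 0: frees the device buffers allocated at
 * initialization and destroys the cuDNN descriptors and handle. It assumes
 * the buffers and descriptors were created as in the initialization sketch
 * above.
 */
#if 0
void convolution_cudnn_finalize(int N, int C, int H, int W, int K, int R,
                                int S, int pad_h, int pad_w, int stride_h,
                                int stride_w, int dilation_h,
                                int dilation_w) {
  // 1. Free GPU memory for input, filter, output, workspace
  CHECK_CUDA(cudaFree(I_gpu));
  CHECK_CUDA(cudaFree(F_gpu));
  CHECK_CUDA(cudaFree(O_gpu));
  CHECK_CUDA(cudaFree(workspace));

  // 2. Destroy descriptors and handle
  CHECK_CUDNN(cudnnDestroyTensorDescriptor(input_desc));
  CHECK_CUDNN(cudnnDestroyFilterDescriptor(filter_desc));
  CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(conv_desc));
  CHECK_CUDNN(cudnnDestroyTensorDescriptor(output_desc));
  CHECK_CUDNN(cudnnDestroy(handle));
}
#endif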