#pragma once

// Host-side interface for the convolution routines and their CUDA helpers.
//
// `#pragma once` must sit alone on its line: a preprocessor directive runs to
// the end of the physical line, so any declaration sharing that line would be
// consumed as part of the directive and never declared.
//
// NOTE(review): the dimension parameters are not described anywhere in this
// header. From the names they are presumably N = batch size, C = input
// channels, H / W = input height / width, K = number of filters,
// R / S = filter height / width -- TODO confirm against the definitions.
//
// NOTE(review): names such as _N, _C, _S start with an underscore followed by
// an uppercase letter, which C and C++ reserve for the implementation.
// Consider renaming them together with the definitions.

// Runs the convolution over `_input` with `_weight`; `_output` is presumably
// the destination buffer (verify against the definition).
void convolution(
    float *_input, float *_output, float *_weight,
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride);

// Setup hook taking the same problem dimensions as convolution().
// Presumably called once before convolution() -- confirm with the caller.
void convolution_init(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride);

// Teardown hook taking the same problem dimensions as convolution().
// Presumably called once after convolution() -- confirm with the caller.
void convolution_final(
    int _N, int _C, int _H, int _W,
    int _K, int _R, int _S,
    int _pad, int _dilation, int _stride);

// CUDA helper steps. None of them takes arguments or returns a value, so they
// presumably operate on state owned by the implementation file -- verify.
void cuda_device_init(void);
void cuda_device_malloc(void);
void cuda_memcpy_host_to_device(void);
void cuda_memcpy_device_to_host(void);
void cuda_kernel_call(void);

// Debug printers. `filter` is presumably laid out as K x C x R x S and
// `input` as N x C x H x W -- TODO confirm against the definitions.
void print_filter(float *filter, int K, int C, int R, int S);
void print_input(float *input, int N, int C, int H, int W);