// cuBLAS-backed single-precision matrix multiplication.
//
// Lifecycle: matmul_cublas_initialize() -> matmul_cublas() (any number of
// times with the same M, N, K) -> matmul_cublas_finalize().

#include <climits>
#include <cstdio>
#include <cstdlib>

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "matmul.h"

// Abort with file/line context when a CUDA runtime call fails.
#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    cudaError_t status_ = (call);                                         \
    if (status_ != cudaSuccess) {                                         \
      fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__,     \
              cudaGetErrorString(status_));                               \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Abort with file/line context when a cuBLAS call fails.
#define CHECK_CUBLAS(call)                                                \
  do {                                                                    \
    cublasStatus_t status_ = (call);                                      \
    if (status_ != CUBLAS_STATUS_SUCCESS) {                               \
      fprintf(stderr, "CUBLAS error (%s:%d): %s, %s\n", __FILE__,         \
              __LINE__, cublasGetStatusName(status_),                     \
              cublasGetStatusString(status_));                            \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

// Device buffers and library handle shared by the three entry points below.
static float *A_gpu = nullptr, *B_gpu = nullptr, *C_gpu = nullptr;
static cublasHandle_t handle = nullptr;

// The cuBLAS entry points used here take `int` dimensions; refuse sizes that
// would be silently truncated by the size_t -> int conversion.
static int to_blas_dim(size_t value, const char *name) {
  if (value > static_cast<size_t>(INT_MAX)) {
    fprintf(stderr, "matmul_cublas: dimension %s=%zu exceeds INT_MAX\n", name,
            value);
    exit(EXIT_FAILURE);
  }
  return static_cast<int>(value);
}

// Creates the cuBLAS handle and allocates device storage for an M x K matrix
// A, a K x N matrix B and an M x N matrix C. Exits the process on any
// CUDA/cuBLAS failure. Call matmul_cublas_finalize() before initializing
// again, otherwise the previous handle and buffers are leaked.
void matmul_cublas_initialize(size_t M, size_t N, size_t K) {
  // Fail early on sizes matmul_cublas() could not pass to cuBLAS anyway.
  (void)to_blas_dim(M, "M");
  (void)to_blas_dim(N, "N");
  (void)to_blas_dim(K, "K");

  // 1. Create cublas handle
  CHECK_CUBLAS(cublasCreate(&handle));

  // 2. Allocate GPU memory for A, B, C
  CHECK_CUDA(cudaMalloc(&A_gpu, M * K * sizeof(float)));
  CHECK_CUDA(cudaMalloc(&B_gpu, K * N * sizeof(float)));
  CHECK_CUDA(cudaMalloc(&C_gpu, M * N * sizeof(float)));
}

// Computes C = A * B on the GPU and copies the result back into C.
//
// A, B and C are host pointers holding M*K, K*N and M*N floats.
// NOTE(review): the layout is assumed to be row-major and densely packed
// (A is M x K, B is K x N, C is M x N) -- confirm against matmul.h / callers.
// M, N, K must match the values given to matmul_cublas_initialize().
void matmul_cublas(float *A, float *B, float *C, size_t M, size_t N,
                   size_t K) {
  // Empty output: nothing to compute or copy.
  if (M == 0 || N == 0) return;

  // Empty inner dimension: the product is all zeros. Handled on the host
  // because a leading dimension of 0 is not a valid cuBLAS argument.
  if (K == 0) {
    for (size_t i = 0; i < M * N; ++i) C[i] = 0.0f;
    return;
  }

  const int m = to_blas_dim(M, "M");
  const int n = to_blas_dim(N, "N");
  const int k = to_blas_dim(K, "K");
  const int elem_size = static_cast<int>(sizeof(float));

  // cuBLAS is column-major. A row-major R x S matrix occupies the same bytes
  // as a column-major S x R matrix (its transpose), so every matrix is
  // described to cuBLAS with its dimensions swapped.

  // 1. Send A from CPU to GPU (seen by cuBLAS as K x M, ld = K)
  CHECK_CUBLAS(cublasSetMatrix(k, m, elem_size, A, k, A_gpu, k));

  // 2. Send B from CPU to GPU (seen by cuBLAS as N x K, ld = N)
  CHECK_CUBLAS(cublasSetMatrix(n, k, elem_size, B, n, B_gpu, n));

  // 3. Run SGEMM: C^T (N x M) = B^T (N x K) * A^T (K x M), which leaves the
  //    row-major product A * B in C_gpu without any explicit transpose.
  const float one = 1.0f, zero = 0.0f;
  CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
                           B_gpu, n, A_gpu, k, &zero, C_gpu, n));

  // 4. Send C from GPU to CPU (seen by cuBLAS as N x M, ld = N)
  CHECK_CUBLAS(cublasGetMatrix(n, m, elem_size, C_gpu, n, C, n));
}

// Releases the device buffers and the cuBLAS handle created by
// matmul_cublas_initialize(). The size arguments are unused and kept only for
// interface symmetry. Safe to call more than once: released resources are
// reset to null and skipped on later calls.
void matmul_cublas_finalize(size_t M, size_t N, size_t K) {
  (void)M;
  (void)N;
  (void)K;

  // 1. Free GPU memory for A, B, C (cudaFree accepts a null pointer)
  CHECK_CUDA(cudaFree(A_gpu));
  CHECK_CUDA(cudaFree(B_gpu));
  CHECK_CUDA(cudaFree(C_gpu));
  A_gpu = B_gpu = C_gpu = nullptr;

  // 2. Destroy cublas handle
  if (handle != nullptr) {
    CHECK_CUBLAS(cublasDestroy(handle));
    handle = nullptr;
  }
}