/* * Copyright (C) 2021 Pascal Nowack * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include "config.h" #include "grd-hwaccel-nvidia.h" #include #include "grd-egl-thread.h" #include "grd-utils.h" #define MAX_CUDA_DEVICES_FOR_RETRIEVAL 32 typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer (CUgraphicsResource *resource, GLuint buffer, unsigned int flags); typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer_v2 (CUdeviceptr *dev_ptr, size_t *size, CUgraphicsResource resource); typedef struct _ExtraCudaFunctions { tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer; tcuGraphicsResourceGetMappedPointer_v2 *cuGraphicsResourceGetMappedPointer; } ExtraCudaFunctions; typedef struct _DevRetrievalData { GrdHwAccelNvidia *hwaccel_nvidia; GrdSyncPoint sync_point; unsigned int n_devices; CUdevice *devices; } DevRetrievalData; typedef struct _NvEncEncodeSession { GrdHwAccelNvidia *hwaccel_nvidia; void *encoder; uint32_t encode_width; uint32_t encode_height; NV_ENC_OUTPUT_PTR buffer_out; NV_ENC_REGISTERED_PTR registered_resource; NV_ENC_INPUT_PTR mapped_resource; } NvEncEncodeSession; struct _GrdHwAccelNvidia { GObject parent; GrdEglThread *egl_thread; CudaFunctions *cuda_funcs; NvencFunctions *nvenc_funcs; NV_ENCODE_API_FUNCTION_LIST nvenc_api; void *cuda_lib; ExtraCudaFunctions *extra_cuda_funcs; CUdevice cu_device; CUcontext cu_context; gboolean initialized; CUmodule cu_module_dmg_utils; CUfunction cu_chk_dmg_pxl; CUfunction cu_cmb_dmg_arr_cols; CUfunction cu_cmb_dmg_arr_rows; CUfunction cu_simplify_dmg_arr; CUmodule cu_module_avc_utils; CUfunction cu_bgrx_to_yuv420; GHashTable *encode_sessions; uint32_t next_encode_session_id; }; G_DEFINE_TYPE (GrdHwAccelNvidia, grd_hwaccel_nvidia, G_TYPE_OBJECT) static void nvenc_encode_session_free (NvEncEncodeSession *encode_session); G_DEFINE_AUTOPTR_CLEANUP_FUNC (NvEncEncodeSession, nvenc_encode_session_free) static NvEncEncodeSession * nvenc_encode_session_new (GrdHwAccelNvidia *hwaccel_nvidia, uint32_t encode_width, uint32_t encode_height) { NvEncEncodeSession *encode_session; encode_session = g_new0 (NvEncEncodeSession, 1); encode_session->hwaccel_nvidia = hwaccel_nvidia; encode_session->encode_width = encode_width; encode_session->encode_height = encode_height; return encode_session; } static void nvenc_encode_session_free (NvEncEncodeSession *encode_session) { GrdHwAccelNvidia *hwaccel_nvidia = encode_session->hwaccel_nvidia; NV_ENCODE_API_FUNCTION_LIST *nvenc_api = &hwaccel_nvidia->nvenc_api; if (encode_session->mapped_resource) { nvenc_api->nvEncUnmapInputResource (encode_session->encoder, encode_session->mapped_resource); } if (encode_session->registered_resource) { nvenc_api->nvEncUnregisterResource (encode_session->encoder, encode_session->registered_resource); } if (encode_session->buffer_out) { nvenc_api->nvEncDestroyBitstreamBuffer (encode_session->encoder, encode_session->buffer_out); } if (encode_session->encoder) { NV_ENC_PIC_PARAMS pic_params = {}; pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS; nvenc_api->nvEncEncodePicture (encode_session->encoder, &pic_params); nvenc_api->nvEncDestroyEncoder (encode_session->encoder); } g_free (encode_session); } void grd_hwaccel_nvidia_get_cuda_functions (GrdHwAccelNvidia *hwaccel_nvidia, gpointer *cuda_funcs) { *cuda_funcs = hwaccel_nvidia->cuda_funcs; } void grd_hwaccel_nvidia_get_cuda_damage_kernels (GrdHwAccelNvidia *hwaccel_nvidia, CUfunction *cu_chk_dmg_pxl, CUfunction *cu_cmb_dmg_arr_cols, CUfunction *cu_cmb_dmg_arr_rows, CUfunction *cu_simplify_dmg_arr) { *cu_chk_dmg_pxl = hwaccel_nvidia->cu_chk_dmg_pxl; *cu_cmb_dmg_arr_cols = hwaccel_nvidia->cu_cmb_dmg_arr_cols; *cu_cmb_dmg_arr_rows = hwaccel_nvidia->cu_cmb_dmg_arr_rows; *cu_simplify_dmg_arr = hwaccel_nvidia->cu_simplify_dmg_arr; } static const char * get_cuda_error_string (GrdHwAccelNvidia *hwaccel_nvidia, CUresult cu_result) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; const char *error_string = NULL; CUresult local_cu_result; g_assert (cuda_funcs); g_assert (cuda_funcs->cuGetErrorString); local_cu_result = cuda_funcs->cuGetErrorString (cu_result, &error_string); if (G_UNLIKELY (local_cu_result != CUDA_SUCCESS)) g_warning ("[HWAccel.CUDA] cuGetErrorString() failed: %i", local_cu_result); return error_string; } void grd_hwaccel_nvidia_push_cuda_context (GrdHwAccelNvidia *hwaccel_nvidia) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; CUresult cu_result; cu_result = cuda_funcs->cuCtxPushCurrent (hwaccel_nvidia->cu_context); if (cu_result != CUDA_SUCCESS) { g_error ("[HWAccel.CUDA] Failed to push CUDA context: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); } } void grd_hwaccel_nvidia_pop_cuda_context (GrdHwAccelNvidia *hwaccel_nvidia) { CUcontext cu_context; hwaccel_nvidia->cuda_funcs->cuCtxPopCurrent (&cu_context); } gboolean grd_hwaccel_nvidia_register_read_only_gl_buffer (GrdHwAccelNvidia *hwaccel_nvidia, CUgraphicsResource *cuda_resource, uint32_t buffer) { ExtraCudaFunctions *extra_cuda_funcs = hwaccel_nvidia->extra_cuda_funcs; CUresult cu_result; cu_result = extra_cuda_funcs->cuGraphicsGLRegisterBuffer (cuda_resource, buffer, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to register GL buffer: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } return TRUE; } void grd_hwaccel_nvidia_unregister_cuda_resource (GrdHwAccelNvidia *hwaccel_nvidia, CUgraphicsResource cuda_resource, CUstream cuda_stream) { hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cuda_stream); hwaccel_nvidia->cuda_funcs->cuGraphicsUnregisterResource (cuda_resource); } gboolean grd_hwaccel_nvidia_map_cuda_resource (GrdHwAccelNvidia *hwaccel_nvidia, CUgraphicsResource cuda_resource, CUdeviceptr *dev_ptr, size_t *size, CUstream cuda_stream) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; ExtraCudaFunctions *extra_cuda_funcs = hwaccel_nvidia->extra_cuda_funcs; CUresult cu_result; cu_result = cuda_funcs->cuGraphicsMapResources (1, &cuda_resource, cuda_stream); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to map resources: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } cu_result = extra_cuda_funcs->cuGraphicsResourceGetMappedPointer (dev_ptr, size, cuda_resource); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to get mapped pointer: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); cuda_funcs->cuGraphicsUnmapResources (1, &cuda_resource, cuda_stream); return FALSE; } return TRUE; } void grd_hwaccel_nvidia_unmap_cuda_resource (GrdHwAccelNvidia *hwaccel_nvidia, CUgraphicsResource cuda_resource, CUstream cuda_stream) { hwaccel_nvidia->cuda_funcs->cuGraphicsUnmapResources (1, &cuda_resource, cuda_stream); } gboolean grd_hwaccel_nvidia_create_cuda_stream (GrdHwAccelNvidia *hwaccel_nvidia, CUstream *cuda_stream) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; CUresult cu_result; cu_result = cuda_funcs->cuStreamCreate (cuda_stream, CU_STREAM_NON_BLOCKING); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to create stream: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } return TRUE; } void grd_hwaccel_nvidia_destroy_cuda_stream (GrdHwAccelNvidia *hwaccel_nvidia, CUstream cuda_stream) { hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cuda_stream); hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cuda_stream); } gboolean grd_hwaccel_nvidia_alloc_mem (GrdHwAccelNvidia *hwaccel_nvidia, CUdeviceptr *device_ptr, size_t size) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; CUresult cu_result; cu_result = cuda_funcs->cuMemAlloc (device_ptr, size); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to allocate memory: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } return TRUE; } void grd_hwaccel_nvidia_clear_mem_ptr (GrdHwAccelNvidia *hwaccel_nvidia, CUdeviceptr *device_ptr) { if (!(*device_ptr)) return; hwaccel_nvidia->cuda_funcs->cuMemFree (*device_ptr); *device_ptr = 0; } static uint32_t get_next_free_encode_session_id (GrdHwAccelNvidia *hwaccel_nvidia) { uint32_t encode_session_id = hwaccel_nvidia->next_encode_session_id; while (g_hash_table_contains (hwaccel_nvidia->encode_sessions, GUINT_TO_POINTER (encode_session_id))) ++encode_session_id; hwaccel_nvidia->next_encode_session_id = encode_session_id + 1; return encode_session_id; } static const char * get_last_nvenc_error_string (GrdHwAccelNvidia *hwaccel_nvidia, void *encoder) { NV_ENCODE_API_FUNCTION_LIST *nvenc_api = &hwaccel_nvidia->nvenc_api; g_assert (nvenc_api); g_assert (nvenc_api->nvEncGetLastErrorString); return nvenc_api->nvEncGetLastErrorString (encoder); } gboolean grd_hwaccel_nvidia_create_nvenc_session (GrdHwAccelNvidia *hwaccel_nvidia, uint32_t *encode_session_id, uint16_t surface_width, uint16_t surface_height, uint16_t *aligned_width, uint16_t *aligned_height, uint16_t refresh_rate) { g_autoptr (NvEncEncodeSession) encode_session = NULL; NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS open_params = {0}; NV_ENC_INITIALIZE_PARAMS init_params = {0}; NV_ENC_CONFIG encode_config = {0}; NV_ENC_CREATE_BITSTREAM_BUFFER create_bitstream_buffer = {0}; *aligned_width = grd_get_aligned_size (surface_width, 16); *aligned_height = grd_get_aligned_size (surface_height, 64); *encode_session_id = get_next_free_encode_session_id (hwaccel_nvidia); encode_session = nvenc_encode_session_new (hwaccel_nvidia, *aligned_width, *aligned_height); open_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; open_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA; open_params.device = hwaccel_nvidia->cu_context; open_params.apiVersion = NVENCAPI_VERSION; if (hwaccel_nvidia->nvenc_api.nvEncOpenEncodeSessionEx ( &open_params, &encode_session->encoder) != NV_ENC_SUCCESS) { g_debug ("[HWAccel.NVENC] Failed to open encode session: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); return FALSE; } encode_config.version = NV_ENC_CONFIG_VER; encode_config.profileGUID = NV_ENC_H264_PROFILE_PROGRESSIVE_HIGH_GUID; encode_config.gopLength = NVENC_INFINITE_GOPLENGTH; encode_config.frameIntervalP = 1; encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME; encode_config.mvPrecision = NV_ENC_MV_PRECISION_QUARTER_PEL; encode_config.rcParams.version = NV_ENC_RC_PARAMS_VER; encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR; encode_config.rcParams.averageBitRate = 0; encode_config.rcParams.maxBitRate = 0; encode_config.rcParams.targetQuality = 22; encode_config.encodeCodecConfig.h264Config.idrPeriod = NVENC_INFINITE_GOPLENGTH; encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1; init_params.version = NV_ENC_INITIALIZE_PARAMS_VER; init_params.encodeGUID = NV_ENC_CODEC_H264_GUID; init_params.encodeWidth = *aligned_width; init_params.encodeHeight = *aligned_height; init_params.darWidth = surface_width; init_params.darHeight = surface_height; init_params.frameRateNum = refresh_rate; init_params.frameRateDen = 1; init_params.enablePTD = 1; init_params.encodeConfig = &encode_config; if (hwaccel_nvidia->nvenc_api.nvEncInitializeEncoder ( encode_session->encoder, &init_params) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Failed to initialize encoder: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); return FALSE; } create_bitstream_buffer.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; if (hwaccel_nvidia->nvenc_api.nvEncCreateBitstreamBuffer ( encode_session->encoder, &create_bitstream_buffer) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Failed to create bitstream buffer: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); return FALSE; } encode_session->buffer_out = create_bitstream_buffer.bitstreamBuffer; g_hash_table_insert (hwaccel_nvidia->encode_sessions, GUINT_TO_POINTER (*encode_session_id), g_steal_pointer (&encode_session)); return TRUE; } void grd_hwaccel_nvidia_free_nvenc_session (GrdHwAccelNvidia *hwaccel_nvidia, uint32_t encode_session_id) { NvEncEncodeSession *encode_session = NULL; if (!g_hash_table_steal_extended (hwaccel_nvidia->encode_sessions, GUINT_TO_POINTER (encode_session_id), NULL, (gpointer *) &encode_session)) g_assert_not_reached (); g_assert (encode_session); nvenc_encode_session_free (encode_session); } gboolean grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia *hwaccel_nvidia, uint32_t encode_session_id, CUdeviceptr src_data, CUdeviceptr *main_view_nv12, uint16_t src_width, uint16_t src_height, uint16_t aligned_width, uint16_t aligned_height, CUstream cuda_stream) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; NvEncEncodeSession *encode_session; NV_ENC_REGISTER_RESOURCE register_res = {0}; NV_ENC_MAP_INPUT_RESOURCE map_input_res = {0}; NV_ENC_PIC_PARAMS pic_params = {0}; unsigned int grid_dim_x, grid_dim_y, grid_dim_z; unsigned int block_dim_x, block_dim_y, block_dim_z; void *args[7]; CUresult cu_result; if (!g_hash_table_lookup_extended (hwaccel_nvidia->encode_sessions, GUINT_TO_POINTER (encode_session_id), NULL, (gpointer *) &encode_session)) return FALSE; g_assert (encode_session->encode_width == aligned_width); g_assert (encode_session->encode_height == aligned_height); g_assert (encode_session->mapped_resource == NULL); g_assert (encode_session->registered_resource == NULL); if (!(*main_view_nv12) && !grd_hwaccel_nvidia_alloc_mem (hwaccel_nvidia, main_view_nv12, aligned_width * (aligned_height + aligned_height / 2))) return FALSE; /* Threads per blocks */ block_dim_x = 16; block_dim_y = 16; block_dim_z = 1; /* Amount of blocks per grid */ grid_dim_x = aligned_width / 2 / block_dim_x + (aligned_width / 2 % block_dim_x ? 1 : 0); grid_dim_y = aligned_height / 2 / block_dim_y + (aligned_height / 2 % block_dim_y ? 1 : 0); grid_dim_z = 1; args[0] = main_view_nv12; args[1] = &src_data; args[2] = &src_width; args[3] = &src_height; args[4] = &aligned_width; args[5] = &aligned_height; args[6] = &aligned_width; cu_result = cuda_funcs->cuLaunchKernel (hwaccel_nvidia->cu_bgrx_to_yuv420, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, block_dim_z, 0, cuda_stream, args, NULL); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to launch BGRX_TO_YUV420 kernel: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } cu_result = cuda_funcs->cuStreamSynchronize (cuda_stream); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to synchronize stream: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } register_res.version = NV_ENC_REGISTER_RESOURCE_VER; register_res.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; register_res.width = aligned_width; register_res.height = aligned_height; register_res.pitch = aligned_width; register_res.resourceToRegister = (void *) *main_view_nv12; register_res.bufferFormat = NV_ENC_BUFFER_FORMAT_NV12; register_res.bufferUsage = NV_ENC_INPUT_IMAGE; if (hwaccel_nvidia->nvenc_api.nvEncRegisterResource ( encode_session->encoder, ®ister_res) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Failed to register resource: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); return FALSE; } map_input_res.version = NV_ENC_MAP_INPUT_RESOURCE_VER; map_input_res.registeredResource = register_res.registeredResource; if (hwaccel_nvidia->nvenc_api.nvEncMapInputResource ( encode_session->encoder, &map_input_res) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Failed to map input resource: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); hwaccel_nvidia->nvenc_api.nvEncUnregisterResource (encode_session->encoder, register_res.registeredResource); return FALSE; } pic_params.version = NV_ENC_PIC_PARAMS_VER; pic_params.inputWidth = aligned_width; pic_params.inputHeight = aligned_height; pic_params.inputPitch = aligned_width; pic_params.inputBuffer = map_input_res.mappedResource; pic_params.outputBitstream = encode_session->buffer_out; pic_params.bufferFmt = map_input_res.mappedBufferFmt; pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; if (hwaccel_nvidia->nvenc_api.nvEncEncodePicture ( encode_session->encoder, &pic_params) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Failed to encode frame: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); hwaccel_nvidia->nvenc_api.nvEncUnmapInputResource (encode_session->encoder, map_input_res.mappedResource); hwaccel_nvidia->nvenc_api.nvEncUnregisterResource (encode_session->encoder, register_res.registeredResource); return FALSE; } encode_session->mapped_resource = map_input_res.mappedResource; encode_session->registered_resource = register_res.registeredResource; return TRUE; } gboolean grd_hwaccel_nvidia_avc420_retrieve_bitstream (GrdHwAccelNvidia *hwaccel_nvidia, uint32_t encode_session_id, uint8_t **bitstream, uint32_t *bitstream_size) { NV_ENCODE_API_FUNCTION_LIST *nvenc_api = &hwaccel_nvidia->nvenc_api; NvEncEncodeSession *encode_session; NV_ENC_LOCK_BITSTREAM lock_bitstream = {0}; gboolean success = FALSE; if (!g_hash_table_lookup_extended (hwaccel_nvidia->encode_sessions, GUINT_TO_POINTER (encode_session_id), NULL, (gpointer *) &encode_session)) g_assert_not_reached (); g_assert (encode_session->mapped_resource != NULL); g_assert (encode_session->registered_resource != NULL); lock_bitstream.version = NV_ENC_LOCK_BITSTREAM_VER; lock_bitstream.outputBitstream = encode_session->buffer_out; if (nvenc_api->nvEncLockBitstream (encode_session->encoder, &lock_bitstream) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Failed to lock bitstream: %s", get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder)); goto out; } if (bitstream_size) *bitstream_size = lock_bitstream.bitstreamSizeInBytes; if (bitstream) *bitstream = g_memdup2 (lock_bitstream.bitstreamBufferPtr, *bitstream_size); nvenc_api->nvEncUnlockBitstream (encode_session->encoder, lock_bitstream.outputBitstream); success = TRUE; out: nvenc_api->nvEncUnmapInputResource (encode_session->encoder, encode_session->mapped_resource); nvenc_api->nvEncUnregisterResource (encode_session->encoder, encode_session->registered_resource); encode_session->mapped_resource = NULL; encode_session->registered_resource = NULL; return success; } static gboolean load_extra_cuda_functions (GrdHwAccelNvidia *hwaccel_nvidia) { ExtraCudaFunctions *extra_cuda_funcs; hwaccel_nvidia->cuda_lib = dlopen ("libcuda.so.1", RTLD_LAZY); if (!hwaccel_nvidia->cuda_lib) return FALSE; hwaccel_nvidia->extra_cuda_funcs = g_malloc0 (sizeof (ExtraCudaFunctions)); extra_cuda_funcs = hwaccel_nvidia->extra_cuda_funcs; extra_cuda_funcs->cuGraphicsGLRegisterBuffer = dlsym (hwaccel_nvidia->cuda_lib, "cuGraphicsGLRegisterBuffer"); if (!extra_cuda_funcs->cuGraphicsGLRegisterBuffer) return FALSE; extra_cuda_funcs->cuGraphicsResourceGetMappedPointer = dlsym (hwaccel_nvidia->cuda_lib, "cuGraphicsResourceGetMappedPointer_v2"); if (!extra_cuda_funcs->cuGraphicsGLRegisterBuffer) return FALSE; return TRUE; } static gboolean get_cuda_devices_in_impl (gpointer user_data) { DevRetrievalData *data = user_data; GrdHwAccelNvidia *hwaccel_nvidia = data->hwaccel_nvidia; CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; return cuda_funcs->cuGLGetDevices (&data->n_devices, data->devices, MAX_CUDA_DEVICES_FOR_RETRIEVAL, CU_GL_DEVICE_LIST_ALL) == CUDA_SUCCESS; } static void compute_devices_ready (gboolean success, gpointer user_data) { GrdSyncPoint *sync_point = user_data; grd_sync_point_complete (sync_point, success); } static gboolean get_cuda_devices_from_gl_context (GrdHwAccelNvidia *hwaccel_nvidia, GrdEglThread *egl_thread, unsigned int *n_returned_devices, CUdevice *device_array) { DevRetrievalData data = {}; gboolean success; grd_sync_point_init (&data.sync_point); data.hwaccel_nvidia = hwaccel_nvidia; data.devices = device_array; grd_egl_thread_run_custom_task (egl_thread, get_cuda_devices_in_impl, &data, compute_devices_ready, &data.sync_point, NULL); success = grd_sync_point_wait_for_completion (&data.sync_point); grd_sync_point_clear (&data.sync_point); *n_returned_devices = data.n_devices; return success; } static gboolean push_cuda_context_in_egl_thread (gpointer user_data) { GrdHwAccelNvidia *hwaccel_nvidia = user_data; grd_hwaccel_nvidia_push_cuda_context (hwaccel_nvidia); return TRUE; } static gboolean pop_cuda_context_in_egl_thread (gpointer user_data) { GrdHwAccelNvidia *hwaccel_nvidia = user_data; grd_hwaccel_nvidia_pop_cuda_context (hwaccel_nvidia); return TRUE; } static void complete_sync (gboolean success, gpointer user_data) { GrdSyncPoint *sync_point = user_data; grd_sync_point_complete (sync_point, success); } static void run_function_in_egl_thread (GrdHwAccelNvidia *hwaccel_nvidia, GrdEglThreadCustomFunc function) { GrdSyncPoint sync_point = {}; grd_sync_point_init (&sync_point); grd_egl_thread_run_custom_task (hwaccel_nvidia->egl_thread, function, hwaccel_nvidia, complete_sync, &sync_point, NULL); grd_sync_point_wait_for_completion (&sync_point); grd_sync_point_clear (&sync_point); } static gboolean load_cuda_module (GrdHwAccelNvidia *hwaccel_nvidia, CUmodule *module, const char *name, const char *ptx_instructions) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; CUresult cu_result; cu_result = cuda_funcs->cuModuleLoadData (module, ptx_instructions); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to load %s module: %s", name, get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } return TRUE; } static gboolean load_cuda_function (GrdHwAccelNvidia *hwaccel_nvidia, CUfunction *function, CUmodule module, const char *name) { CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs; CUresult cu_result; cu_result = cuda_funcs->cuModuleGetFunction (function, module, name); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to get kernel %s: %s", name, get_cuda_error_string (hwaccel_nvidia, cu_result)); return FALSE; } return TRUE; } GrdHwAccelNvidia * grd_hwaccel_nvidia_new (GrdEglThread *egl_thread) { g_autoptr (GrdHwAccelNvidia) hwaccel_nvidia = NULL; gboolean cuda_device_found = FALSE; CUdevice cu_devices[MAX_CUDA_DEVICES_FOR_RETRIEVAL] = {}; CUdevice cu_device = 0; unsigned int cu_device_count = 0; CudaFunctions *cuda_funcs; NvencFunctions *nvenc_funcs; g_autofree char *dmg_ptx_path = NULL; g_autofree char *dmg_ptx_instructions = NULL; g_autofree char *avc_ptx_path = NULL; g_autofree char *avc_ptx_instructions = NULL; g_autoptr (GError) error = NULL; CUresult cu_result; unsigned int i; hwaccel_nvidia = g_object_new (GRD_TYPE_HWACCEL_NVIDIA, NULL); hwaccel_nvidia->egl_thread = egl_thread; cuda_load_functions (&hwaccel_nvidia->cuda_funcs, NULL); nvenc_load_functions (&hwaccel_nvidia->nvenc_funcs, NULL); if (!hwaccel_nvidia->cuda_funcs || !hwaccel_nvidia->nvenc_funcs) { g_debug ("[HWAccel.CUDA] Failed to load CUDA or NVENC library"); return NULL; } if (!load_extra_cuda_functions (hwaccel_nvidia)) { g_warning ("[HWAccel.CUDA] Failed to load extra CUDA functions"); return NULL; } cuda_funcs = hwaccel_nvidia->cuda_funcs; nvenc_funcs = hwaccel_nvidia->nvenc_funcs; cu_result = cuda_funcs->cuInit (0); if (cu_result != CUDA_SUCCESS) { g_debug ("[HWAccel.CUDA] Failed to initialize CUDA: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return NULL; } if (!get_cuda_devices_from_gl_context (hwaccel_nvidia, egl_thread, &cu_device_count, cu_devices)) { g_message ("[HWAccel.CUDA] Unable to retrieve CUDA devices"); return NULL; } g_debug ("[HWAccel.CUDA] Retrieved %u CUDA device(s)", cu_device_count); for (i = 0; i < cu_device_count; ++i) { int cc_major = 0, cc_minor = 0; cu_device = cu_devices[i]; cu_result = cuda_funcs->cuDeviceGetAttribute (&cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to get device attribute: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); continue; } cu_result = cuda_funcs->cuDeviceGetAttribute (&cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to get device attribute: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); continue; } g_debug ("[HWAccel.CUDA] Device %u compute capability: [%i, %i]", i, cc_major, cc_minor); if (cc_major >= 3) { g_debug ("[HWAccel.CUDA] Choosing CUDA device with id %u", i); cuda_device_found = TRUE; break; } } if (!cu_device_count || !cuda_device_found) { g_debug ("[HWAccel.CUDA] No appropriate CUDA capable gpu found"); return NULL; } hwaccel_nvidia->cu_device = cu_device; cu_result = cuda_funcs->cuDevicePrimaryCtxRetain (&hwaccel_nvidia->cu_context, hwaccel_nvidia->cu_device); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to retain CUDA context: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); return NULL; } hwaccel_nvidia->nvenc_api.version = NV_ENCODE_API_FUNCTION_LIST_VER; if (nvenc_funcs->NvEncodeAPICreateInstance (&hwaccel_nvidia->nvenc_api) != NV_ENC_SUCCESS) { g_warning ("[HWAccel.NVENC] Could not create NVENC API instance"); cuda_funcs->cuDevicePrimaryCtxRelease (hwaccel_nvidia->cu_device); return NULL; } cu_result = cuda_funcs->cuCtxPushCurrent (hwaccel_nvidia->cu_context); if (cu_result != CUDA_SUCCESS) { g_warning ("[HWAccel.CUDA] Failed to push CUDA context: %s", get_cuda_error_string (hwaccel_nvidia, cu_result)); cuda_funcs->cuDevicePrimaryCtxRelease (hwaccel_nvidia->cu_device); return NULL; } run_function_in_egl_thread (hwaccel_nvidia, push_cuda_context_in_egl_thread); hwaccel_nvidia->initialized = TRUE; dmg_ptx_path = g_strdup_printf ("%s/grd-cuda-damage-utils_30.ptx", GRD_DATA_DIR); avc_ptx_path = g_strdup_printf ("%s/grd-cuda-avc-utils_30.ptx", GRD_DATA_DIR); if (!g_file_get_contents (dmg_ptx_path, &dmg_ptx_instructions, NULL, &error) || !g_file_get_contents (avc_ptx_path, &avc_ptx_instructions, NULL, &error)) g_error ("[HWAccel.CUDA] Failed to read PTX instructions: %s", error->message); if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_dmg_utils, "damage utils", dmg_ptx_instructions)) return NULL; if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_chk_dmg_pxl, hwaccel_nvidia->cu_module_dmg_utils, "check_damaged_pixel") || !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_cols, hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_cols") || !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_rows, hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_rows") || !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_simplify_dmg_arr, hwaccel_nvidia->cu_module_dmg_utils, "simplify_damage_array")) return NULL; if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_avc_utils, "AVC utils", avc_ptx_instructions)) return NULL; if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_bgrx_to_yuv420, hwaccel_nvidia->cu_module_avc_utils, "convert_2x2_bgrx_area_to_yuv420_nv12")) return NULL; return g_steal_pointer (&hwaccel_nvidia); } static void grd_hwaccel_nvidia_dispose (GObject *object) { GrdHwAccelNvidia *hwaccel_nvidia = GRD_HWACCEL_NVIDIA (object); g_clear_pointer (&hwaccel_nvidia->cu_module_avc_utils, hwaccel_nvidia->cuda_funcs->cuModuleUnload); g_clear_pointer (&hwaccel_nvidia->cu_module_dmg_utils, hwaccel_nvidia->cuda_funcs->cuModuleUnload); if (hwaccel_nvidia->initialized) { run_function_in_egl_thread (hwaccel_nvidia, pop_cuda_context_in_egl_thread); hwaccel_nvidia->cuda_funcs->cuCtxPopCurrent (&hwaccel_nvidia->cu_context); hwaccel_nvidia->cuda_funcs->cuDevicePrimaryCtxRelease (hwaccel_nvidia->cu_device); hwaccel_nvidia->initialized = FALSE; } g_clear_pointer (&hwaccel_nvidia->cuda_lib, dlclose); g_clear_pointer (&hwaccel_nvidia->extra_cuda_funcs, g_free); nvenc_free_functions (&hwaccel_nvidia->nvenc_funcs); cuda_free_functions (&hwaccel_nvidia->cuda_funcs); g_assert (!hwaccel_nvidia->encode_sessions || g_hash_table_size (hwaccel_nvidia->encode_sessions) == 0); g_clear_pointer (&hwaccel_nvidia->encode_sessions, g_hash_table_destroy); G_OBJECT_CLASS (grd_hwaccel_nvidia_parent_class)->dispose (object); } static void grd_hwaccel_nvidia_init (GrdHwAccelNvidia *hwaccel_nvidia) { hwaccel_nvidia->encode_sessions = g_hash_table_new (NULL, NULL); } static void grd_hwaccel_nvidia_class_init (GrdHwAccelNvidiaClass *klass) { GObjectClass *object_class = G_OBJECT_CLASS (klass); object_class->dispose = grd_hwaccel_nvidia_dispose; }