mirror of
https://github.com/morgan9e/grd
synced 2026-04-14 00:14:18 +09:00
1018 lines
35 KiB
C
1018 lines
35 KiB
C
/*
|
|
* Copyright (C) 2021 Pascal Nowack
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation; either version 2 of the
|
|
* License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
|
|
* 02111-1307, USA.
|
|
*/
|
|
|
|
#include "config.h"
|
|
|
|
#include "grd-hwaccel-nvidia.h"
|
|
|
|
#include <ffnvcodec/dynlink_loader.h>
|
|
|
|
#include "grd-egl-thread.h"
|
|
#include "grd-utils.h"
|
|
|
|
#define MAX_CUDA_DEVICES_FOR_RETRIEVAL 32
|
|
|
|
typedef CUresult CUDAAPI tcuGraphicsGLRegisterBuffer (CUgraphicsResource *resource,
|
|
GLuint buffer,
|
|
unsigned int flags);
|
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer_v2 (CUdeviceptr *dev_ptr,
|
|
size_t *size,
|
|
CUgraphicsResource resource);
|
|
|
|
typedef struct _ExtraCudaFunctions
|
|
{
|
|
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
|
|
tcuGraphicsResourceGetMappedPointer_v2 *cuGraphicsResourceGetMappedPointer;
|
|
} ExtraCudaFunctions;
|
|
|
|
typedef struct _DevRetrievalData
|
|
{
|
|
GrdHwAccelNvidia *hwaccel_nvidia;
|
|
GrdSyncPoint sync_point;
|
|
|
|
unsigned int n_devices;
|
|
CUdevice *devices;
|
|
} DevRetrievalData;
|
|
|
|
typedef struct _NvEncEncodeSession
|
|
{
|
|
GrdHwAccelNvidia *hwaccel_nvidia;
|
|
|
|
void *encoder;
|
|
|
|
uint32_t encode_width;
|
|
uint32_t encode_height;
|
|
|
|
NV_ENC_OUTPUT_PTR buffer_out;
|
|
|
|
NV_ENC_REGISTERED_PTR registered_resource;
|
|
NV_ENC_INPUT_PTR mapped_resource;
|
|
} NvEncEncodeSession;
|
|
|
|
struct _GrdHwAccelNvidia
|
|
{
|
|
GObject parent;
|
|
|
|
GrdEglThread *egl_thread;
|
|
|
|
CudaFunctions *cuda_funcs;
|
|
NvencFunctions *nvenc_funcs;
|
|
NV_ENCODE_API_FUNCTION_LIST nvenc_api;
|
|
|
|
void *cuda_lib;
|
|
ExtraCudaFunctions *extra_cuda_funcs;
|
|
|
|
CUdevice cu_device;
|
|
CUcontext cu_context;
|
|
gboolean initialized;
|
|
|
|
CUmodule cu_module_dmg_utils;
|
|
CUfunction cu_chk_dmg_pxl;
|
|
CUfunction cu_cmb_dmg_arr_cols;
|
|
CUfunction cu_cmb_dmg_arr_rows;
|
|
CUfunction cu_simplify_dmg_arr;
|
|
|
|
CUmodule cu_module_avc_utils;
|
|
CUfunction cu_bgrx_to_yuv420;
|
|
|
|
GHashTable *encode_sessions;
|
|
|
|
uint32_t next_encode_session_id;
|
|
};
|
|
|
|
G_DEFINE_TYPE (GrdHwAccelNvidia, grd_hwaccel_nvidia, G_TYPE_OBJECT)
|
|
|
|
static void
|
|
nvenc_encode_session_free (NvEncEncodeSession *encode_session);
|
|
|
|
G_DEFINE_AUTOPTR_CLEANUP_FUNC (NvEncEncodeSession, nvenc_encode_session_free)
|
|
|
|
static NvEncEncodeSession *
|
|
nvenc_encode_session_new (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
uint32_t encode_width,
|
|
uint32_t encode_height)
|
|
{
|
|
NvEncEncodeSession *encode_session;
|
|
|
|
encode_session = g_new0 (NvEncEncodeSession, 1);
|
|
encode_session->hwaccel_nvidia = hwaccel_nvidia;
|
|
encode_session->encode_width = encode_width;
|
|
encode_session->encode_height = encode_height;
|
|
|
|
return encode_session;
|
|
}
|
|
|
|
static void
|
|
nvenc_encode_session_free (NvEncEncodeSession *encode_session)
|
|
{
|
|
GrdHwAccelNvidia *hwaccel_nvidia = encode_session->hwaccel_nvidia;
|
|
NV_ENCODE_API_FUNCTION_LIST *nvenc_api = &hwaccel_nvidia->nvenc_api;
|
|
|
|
if (encode_session->mapped_resource)
|
|
{
|
|
nvenc_api->nvEncUnmapInputResource (encode_session->encoder,
|
|
encode_session->mapped_resource);
|
|
}
|
|
if (encode_session->registered_resource)
|
|
{
|
|
nvenc_api->nvEncUnregisterResource (encode_session->encoder,
|
|
encode_session->registered_resource);
|
|
}
|
|
|
|
if (encode_session->buffer_out)
|
|
{
|
|
nvenc_api->nvEncDestroyBitstreamBuffer (encode_session->encoder,
|
|
encode_session->buffer_out);
|
|
}
|
|
|
|
if (encode_session->encoder)
|
|
{
|
|
NV_ENC_PIC_PARAMS pic_params = {};
|
|
|
|
pic_params.encodePicFlags = NV_ENC_PIC_FLAG_EOS;
|
|
nvenc_api->nvEncEncodePicture (encode_session->encoder, &pic_params);
|
|
|
|
nvenc_api->nvEncDestroyEncoder (encode_session->encoder);
|
|
}
|
|
|
|
g_free (encode_session);
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_get_cuda_functions (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
gpointer *cuda_funcs)
|
|
{
|
|
*cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_get_cuda_damage_kernels (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUfunction *cu_chk_dmg_pxl,
|
|
CUfunction *cu_cmb_dmg_arr_cols,
|
|
CUfunction *cu_cmb_dmg_arr_rows,
|
|
CUfunction *cu_simplify_dmg_arr)
|
|
{
|
|
*cu_chk_dmg_pxl = hwaccel_nvidia->cu_chk_dmg_pxl;
|
|
*cu_cmb_dmg_arr_cols = hwaccel_nvidia->cu_cmb_dmg_arr_cols;
|
|
*cu_cmb_dmg_arr_rows = hwaccel_nvidia->cu_cmb_dmg_arr_rows;
|
|
*cu_simplify_dmg_arr = hwaccel_nvidia->cu_simplify_dmg_arr;
|
|
}
|
|
|
|
static const char *
|
|
get_cuda_error_string (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUresult cu_result)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
const char *error_string = NULL;
|
|
CUresult local_cu_result;
|
|
|
|
g_assert (cuda_funcs);
|
|
g_assert (cuda_funcs->cuGetErrorString);
|
|
|
|
local_cu_result = cuda_funcs->cuGetErrorString (cu_result, &error_string);
|
|
if (G_UNLIKELY (local_cu_result != CUDA_SUCCESS))
|
|
g_warning ("[HWAccel.CUDA] cuGetErrorString() failed: %i", local_cu_result);
|
|
|
|
return error_string;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_push_cuda_context (GrdHwAccelNvidia *hwaccel_nvidia)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result = cuda_funcs->cuCtxPushCurrent (hwaccel_nvidia->cu_context);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_error ("[HWAccel.CUDA] Failed to push CUDA context: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
}
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_pop_cuda_context (GrdHwAccelNvidia *hwaccel_nvidia)
|
|
{
|
|
CUcontext cu_context;
|
|
|
|
hwaccel_nvidia->cuda_funcs->cuCtxPopCurrent (&cu_context);
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_register_read_only_gl_buffer (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUgraphicsResource *cuda_resource,
|
|
uint32_t buffer)
|
|
{
|
|
ExtraCudaFunctions *extra_cuda_funcs = hwaccel_nvidia->extra_cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result =
|
|
extra_cuda_funcs->cuGraphicsGLRegisterBuffer (cuda_resource, buffer,
|
|
CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to register GL buffer: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_unregister_cuda_resource (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUgraphicsResource cuda_resource,
|
|
CUstream cuda_stream)
|
|
{
|
|
hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cuda_stream);
|
|
hwaccel_nvidia->cuda_funcs->cuGraphicsUnregisterResource (cuda_resource);
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_map_cuda_resource (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUgraphicsResource cuda_resource,
|
|
CUdeviceptr *dev_ptr,
|
|
size_t *size,
|
|
CUstream cuda_stream)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
ExtraCudaFunctions *extra_cuda_funcs = hwaccel_nvidia->extra_cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result = cuda_funcs->cuGraphicsMapResources (1, &cuda_resource,
|
|
cuda_stream);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to map resources: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
cu_result = extra_cuda_funcs->cuGraphicsResourceGetMappedPointer (dev_ptr, size,
|
|
cuda_resource);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to get mapped pointer: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
cuda_funcs->cuGraphicsUnmapResources (1, &cuda_resource, cuda_stream);
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_unmap_cuda_resource (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUgraphicsResource cuda_resource,
|
|
CUstream cuda_stream)
|
|
{
|
|
hwaccel_nvidia->cuda_funcs->cuGraphicsUnmapResources (1, &cuda_resource,
|
|
cuda_stream);
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_create_cuda_stream (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUstream *cuda_stream)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result = cuda_funcs->cuStreamCreate (cuda_stream, CU_STREAM_NON_BLOCKING);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to create stream: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_destroy_cuda_stream (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUstream cuda_stream)
|
|
{
|
|
hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cuda_stream);
|
|
hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cuda_stream);
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_alloc_mem (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUdeviceptr *device_ptr,
|
|
size_t size)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result = cuda_funcs->cuMemAlloc (device_ptr, size);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to allocate memory: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_clear_mem_ptr (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUdeviceptr *device_ptr)
|
|
{
|
|
if (!(*device_ptr))
|
|
return;
|
|
|
|
hwaccel_nvidia->cuda_funcs->cuMemFree (*device_ptr);
|
|
*device_ptr = 0;
|
|
}
|
|
|
|
static uint32_t
|
|
get_next_free_encode_session_id (GrdHwAccelNvidia *hwaccel_nvidia)
|
|
{
|
|
uint32_t encode_session_id = hwaccel_nvidia->next_encode_session_id;
|
|
|
|
while (g_hash_table_contains (hwaccel_nvidia->encode_sessions,
|
|
GUINT_TO_POINTER (encode_session_id)))
|
|
++encode_session_id;
|
|
|
|
hwaccel_nvidia->next_encode_session_id = encode_session_id + 1;
|
|
|
|
return encode_session_id;
|
|
}
|
|
|
|
static const char *
|
|
get_last_nvenc_error_string (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
void *encoder)
|
|
{
|
|
NV_ENCODE_API_FUNCTION_LIST *nvenc_api = &hwaccel_nvidia->nvenc_api;
|
|
|
|
g_assert (nvenc_api);
|
|
g_assert (nvenc_api->nvEncGetLastErrorString);
|
|
|
|
return nvenc_api->nvEncGetLastErrorString (encoder);
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_create_nvenc_session (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
uint32_t *encode_session_id,
|
|
uint16_t surface_width,
|
|
uint16_t surface_height,
|
|
uint16_t *aligned_width,
|
|
uint16_t *aligned_height,
|
|
uint16_t refresh_rate)
|
|
{
|
|
g_autoptr (NvEncEncodeSession) encode_session = NULL;
|
|
NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS open_params = {0};
|
|
NV_ENC_INITIALIZE_PARAMS init_params = {0};
|
|
NV_ENC_CONFIG encode_config = {0};
|
|
NV_ENC_CREATE_BITSTREAM_BUFFER create_bitstream_buffer = {0};
|
|
|
|
*aligned_width = grd_get_aligned_size (surface_width, 16);
|
|
*aligned_height = grd_get_aligned_size (surface_height, 64);
|
|
|
|
*encode_session_id = get_next_free_encode_session_id (hwaccel_nvidia);
|
|
encode_session = nvenc_encode_session_new (hwaccel_nvidia,
|
|
*aligned_width, *aligned_height);
|
|
|
|
open_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
|
|
open_params.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
|
|
open_params.device = hwaccel_nvidia->cu_context;
|
|
open_params.apiVersion = NVENCAPI_VERSION;
|
|
|
|
if (hwaccel_nvidia->nvenc_api.nvEncOpenEncodeSessionEx (
|
|
&open_params, &encode_session->encoder) != NV_ENC_SUCCESS)
|
|
{
|
|
g_debug ("[HWAccel.NVENC] Failed to open encode session: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
return FALSE;
|
|
}
|
|
|
|
encode_config.version = NV_ENC_CONFIG_VER;
|
|
encode_config.profileGUID = NV_ENC_H264_PROFILE_PROGRESSIVE_HIGH_GUID;
|
|
encode_config.gopLength = NVENC_INFINITE_GOPLENGTH;
|
|
encode_config.frameIntervalP = 1;
|
|
encode_config.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
|
|
encode_config.mvPrecision = NV_ENC_MV_PRECISION_QUARTER_PEL;
|
|
encode_config.rcParams.version = NV_ENC_RC_PARAMS_VER;
|
|
encode_config.rcParams.rateControlMode = NV_ENC_PARAMS_RC_VBR;
|
|
encode_config.rcParams.averageBitRate = 0;
|
|
encode_config.rcParams.maxBitRate = 0;
|
|
encode_config.rcParams.targetQuality = 22;
|
|
encode_config.encodeCodecConfig.h264Config.idrPeriod = NVENC_INFINITE_GOPLENGTH;
|
|
encode_config.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
|
|
|
|
init_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
|
|
init_params.encodeGUID = NV_ENC_CODEC_H264_GUID;
|
|
init_params.encodeWidth = *aligned_width;
|
|
init_params.encodeHeight = *aligned_height;
|
|
init_params.darWidth = surface_width;
|
|
init_params.darHeight = surface_height;
|
|
init_params.frameRateNum = refresh_rate;
|
|
init_params.frameRateDen = 1;
|
|
init_params.enablePTD = 1;
|
|
init_params.encodeConfig = &encode_config;
|
|
if (hwaccel_nvidia->nvenc_api.nvEncInitializeEncoder (
|
|
encode_session->encoder, &init_params) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Failed to initialize encoder: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
return FALSE;
|
|
}
|
|
|
|
create_bitstream_buffer.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
|
|
if (hwaccel_nvidia->nvenc_api.nvEncCreateBitstreamBuffer (
|
|
encode_session->encoder, &create_bitstream_buffer) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Failed to create bitstream buffer: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
return FALSE;
|
|
}
|
|
encode_session->buffer_out = create_bitstream_buffer.bitstreamBuffer;
|
|
|
|
g_hash_table_insert (hwaccel_nvidia->encode_sessions,
|
|
GUINT_TO_POINTER (*encode_session_id),
|
|
g_steal_pointer (&encode_session));
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
void
|
|
grd_hwaccel_nvidia_free_nvenc_session (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
uint32_t encode_session_id)
|
|
{
|
|
NvEncEncodeSession *encode_session = NULL;
|
|
|
|
if (!g_hash_table_steal_extended (hwaccel_nvidia->encode_sessions,
|
|
GUINT_TO_POINTER (encode_session_id),
|
|
NULL, (gpointer *) &encode_session))
|
|
g_assert_not_reached ();
|
|
|
|
g_assert (encode_session);
|
|
nvenc_encode_session_free (encode_session);
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
uint32_t encode_session_id,
|
|
CUdeviceptr src_data,
|
|
CUdeviceptr *main_view_nv12,
|
|
uint16_t src_width,
|
|
uint16_t src_height,
|
|
uint16_t aligned_width,
|
|
uint16_t aligned_height,
|
|
CUstream cuda_stream)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
NvEncEncodeSession *encode_session;
|
|
NV_ENC_REGISTER_RESOURCE register_res = {0};
|
|
NV_ENC_MAP_INPUT_RESOURCE map_input_res = {0};
|
|
NV_ENC_PIC_PARAMS pic_params = {0};
|
|
unsigned int grid_dim_x, grid_dim_y, grid_dim_z;
|
|
unsigned int block_dim_x, block_dim_y, block_dim_z;
|
|
void *args[7];
|
|
CUresult cu_result;
|
|
|
|
if (!g_hash_table_lookup_extended (hwaccel_nvidia->encode_sessions,
|
|
GUINT_TO_POINTER (encode_session_id),
|
|
NULL, (gpointer *) &encode_session))
|
|
return FALSE;
|
|
|
|
g_assert (encode_session->encode_width == aligned_width);
|
|
g_assert (encode_session->encode_height == aligned_height);
|
|
|
|
g_assert (encode_session->mapped_resource == NULL);
|
|
g_assert (encode_session->registered_resource == NULL);
|
|
|
|
if (!(*main_view_nv12) &&
|
|
!grd_hwaccel_nvidia_alloc_mem (hwaccel_nvidia, main_view_nv12,
|
|
aligned_width * (aligned_height + aligned_height / 2)))
|
|
return FALSE;
|
|
|
|
/* Threads per blocks */
|
|
block_dim_x = 16;
|
|
block_dim_y = 16;
|
|
block_dim_z = 1;
|
|
/* Amount of blocks per grid */
|
|
grid_dim_x = aligned_width / 2 / block_dim_x +
|
|
(aligned_width / 2 % block_dim_x ? 1 : 0);
|
|
grid_dim_y = aligned_height / 2 / block_dim_y +
|
|
(aligned_height / 2 % block_dim_y ? 1 : 0);
|
|
grid_dim_z = 1;
|
|
|
|
args[0] = main_view_nv12;
|
|
args[1] = &src_data;
|
|
args[2] = &src_width;
|
|
args[3] = &src_height;
|
|
args[4] = &aligned_width;
|
|
args[5] = &aligned_height;
|
|
args[6] = &aligned_width;
|
|
|
|
cu_result = cuda_funcs->cuLaunchKernel (hwaccel_nvidia->cu_bgrx_to_yuv420,
|
|
grid_dim_x, grid_dim_y, grid_dim_z,
|
|
block_dim_x, block_dim_y, block_dim_z,
|
|
0, cuda_stream, args, NULL);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to launch BGRX_TO_YUV420 kernel: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
cu_result = cuda_funcs->cuStreamSynchronize (cuda_stream);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to synchronize stream: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
register_res.version = NV_ENC_REGISTER_RESOURCE_VER;
|
|
register_res.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
|
|
register_res.width = aligned_width;
|
|
register_res.height = aligned_height;
|
|
register_res.pitch = aligned_width;
|
|
register_res.resourceToRegister = (void *) *main_view_nv12;
|
|
register_res.bufferFormat = NV_ENC_BUFFER_FORMAT_NV12;
|
|
register_res.bufferUsage = NV_ENC_INPUT_IMAGE;
|
|
|
|
if (hwaccel_nvidia->nvenc_api.nvEncRegisterResource (
|
|
encode_session->encoder, ®ister_res) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Failed to register resource: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
return FALSE;
|
|
}
|
|
|
|
map_input_res.version = NV_ENC_MAP_INPUT_RESOURCE_VER;
|
|
map_input_res.registeredResource = register_res.registeredResource;
|
|
|
|
if (hwaccel_nvidia->nvenc_api.nvEncMapInputResource (
|
|
encode_session->encoder, &map_input_res) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Failed to map input resource: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
hwaccel_nvidia->nvenc_api.nvEncUnregisterResource (encode_session->encoder,
|
|
register_res.registeredResource);
|
|
return FALSE;
|
|
}
|
|
|
|
pic_params.version = NV_ENC_PIC_PARAMS_VER;
|
|
pic_params.inputWidth = aligned_width;
|
|
pic_params.inputHeight = aligned_height;
|
|
pic_params.inputPitch = aligned_width;
|
|
pic_params.inputBuffer = map_input_res.mappedResource;
|
|
pic_params.outputBitstream = encode_session->buffer_out;
|
|
pic_params.bufferFmt = map_input_res.mappedBufferFmt;
|
|
pic_params.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
|
|
|
|
if (hwaccel_nvidia->nvenc_api.nvEncEncodePicture (
|
|
encode_session->encoder, &pic_params) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Failed to encode frame: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
hwaccel_nvidia->nvenc_api.nvEncUnmapInputResource (encode_session->encoder,
|
|
map_input_res.mappedResource);
|
|
hwaccel_nvidia->nvenc_api.nvEncUnregisterResource (encode_session->encoder,
|
|
register_res.registeredResource);
|
|
return FALSE;
|
|
}
|
|
|
|
encode_session->mapped_resource = map_input_res.mappedResource;
|
|
encode_session->registered_resource = register_res.registeredResource;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
gboolean
|
|
grd_hwaccel_nvidia_avc420_retrieve_bitstream (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
uint32_t encode_session_id,
|
|
uint8_t **bitstream,
|
|
uint32_t *bitstream_size)
|
|
{
|
|
NV_ENCODE_API_FUNCTION_LIST *nvenc_api = &hwaccel_nvidia->nvenc_api;
|
|
NvEncEncodeSession *encode_session;
|
|
NV_ENC_LOCK_BITSTREAM lock_bitstream = {0};
|
|
gboolean success = FALSE;
|
|
|
|
if (!g_hash_table_lookup_extended (hwaccel_nvidia->encode_sessions,
|
|
GUINT_TO_POINTER (encode_session_id),
|
|
NULL, (gpointer *) &encode_session))
|
|
g_assert_not_reached ();
|
|
|
|
g_assert (encode_session->mapped_resource != NULL);
|
|
g_assert (encode_session->registered_resource != NULL);
|
|
|
|
lock_bitstream.version = NV_ENC_LOCK_BITSTREAM_VER;
|
|
lock_bitstream.outputBitstream = encode_session->buffer_out;
|
|
|
|
if (nvenc_api->nvEncLockBitstream (encode_session->encoder,
|
|
&lock_bitstream) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Failed to lock bitstream: %s",
|
|
get_last_nvenc_error_string (hwaccel_nvidia, encode_session->encoder));
|
|
goto out;
|
|
}
|
|
|
|
if (bitstream_size)
|
|
*bitstream_size = lock_bitstream.bitstreamSizeInBytes;
|
|
if (bitstream)
|
|
*bitstream = g_memdup2 (lock_bitstream.bitstreamBufferPtr, *bitstream_size);
|
|
|
|
nvenc_api->nvEncUnlockBitstream (encode_session->encoder,
|
|
lock_bitstream.outputBitstream);
|
|
|
|
success = TRUE;
|
|
|
|
out:
|
|
nvenc_api->nvEncUnmapInputResource (encode_session->encoder,
|
|
encode_session->mapped_resource);
|
|
nvenc_api->nvEncUnregisterResource (encode_session->encoder,
|
|
encode_session->registered_resource);
|
|
encode_session->mapped_resource = NULL;
|
|
encode_session->registered_resource = NULL;
|
|
|
|
return success;
|
|
}
|
|
|
|
static gboolean
|
|
load_extra_cuda_functions (GrdHwAccelNvidia *hwaccel_nvidia)
|
|
{
|
|
ExtraCudaFunctions *extra_cuda_funcs;
|
|
|
|
hwaccel_nvidia->cuda_lib = dlopen ("libcuda.so.1", RTLD_LAZY);
|
|
if (!hwaccel_nvidia->cuda_lib)
|
|
return FALSE;
|
|
|
|
hwaccel_nvidia->extra_cuda_funcs = g_malloc0 (sizeof (ExtraCudaFunctions));
|
|
|
|
extra_cuda_funcs = hwaccel_nvidia->extra_cuda_funcs;
|
|
extra_cuda_funcs->cuGraphicsGLRegisterBuffer =
|
|
dlsym (hwaccel_nvidia->cuda_lib, "cuGraphicsGLRegisterBuffer");
|
|
if (!extra_cuda_funcs->cuGraphicsGLRegisterBuffer)
|
|
return FALSE;
|
|
|
|
extra_cuda_funcs->cuGraphicsResourceGetMappedPointer =
|
|
dlsym (hwaccel_nvidia->cuda_lib, "cuGraphicsResourceGetMappedPointer_v2");
|
|
if (!extra_cuda_funcs->cuGraphicsGLRegisterBuffer)
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
get_cuda_devices_in_impl (gpointer user_data)
|
|
{
|
|
DevRetrievalData *data = user_data;
|
|
GrdHwAccelNvidia *hwaccel_nvidia = data->hwaccel_nvidia;
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
|
|
return cuda_funcs->cuGLGetDevices (&data->n_devices, data->devices,
|
|
MAX_CUDA_DEVICES_FOR_RETRIEVAL,
|
|
CU_GL_DEVICE_LIST_ALL) == CUDA_SUCCESS;
|
|
}
|
|
|
|
static void
|
|
compute_devices_ready (gboolean success,
|
|
gpointer user_data)
|
|
{
|
|
GrdSyncPoint *sync_point = user_data;
|
|
|
|
grd_sync_point_complete (sync_point, success);
|
|
}
|
|
|
|
static gboolean
|
|
get_cuda_devices_from_gl_context (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
GrdEglThread *egl_thread,
|
|
unsigned int *n_returned_devices,
|
|
CUdevice *device_array)
|
|
{
|
|
DevRetrievalData data = {};
|
|
gboolean success;
|
|
|
|
grd_sync_point_init (&data.sync_point);
|
|
data.hwaccel_nvidia = hwaccel_nvidia;
|
|
data.devices = device_array;
|
|
|
|
grd_egl_thread_run_custom_task (egl_thread,
|
|
get_cuda_devices_in_impl,
|
|
&data,
|
|
compute_devices_ready,
|
|
&data.sync_point,
|
|
NULL);
|
|
|
|
success = grd_sync_point_wait_for_completion (&data.sync_point);
|
|
grd_sync_point_clear (&data.sync_point);
|
|
|
|
*n_returned_devices = data.n_devices;
|
|
|
|
return success;
|
|
}
|
|
|
|
static gboolean
|
|
push_cuda_context_in_egl_thread (gpointer user_data)
|
|
{
|
|
GrdHwAccelNvidia *hwaccel_nvidia = user_data;
|
|
|
|
grd_hwaccel_nvidia_push_cuda_context (hwaccel_nvidia);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
pop_cuda_context_in_egl_thread (gpointer user_data)
|
|
{
|
|
GrdHwAccelNvidia *hwaccel_nvidia = user_data;
|
|
|
|
grd_hwaccel_nvidia_pop_cuda_context (hwaccel_nvidia);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static void
|
|
complete_sync (gboolean success,
|
|
gpointer user_data)
|
|
{
|
|
GrdSyncPoint *sync_point = user_data;
|
|
|
|
grd_sync_point_complete (sync_point, success);
|
|
}
|
|
|
|
static void
|
|
run_function_in_egl_thread (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
GrdEglThreadCustomFunc function)
|
|
{
|
|
GrdSyncPoint sync_point = {};
|
|
|
|
grd_sync_point_init (&sync_point);
|
|
|
|
grd_egl_thread_run_custom_task (hwaccel_nvidia->egl_thread,
|
|
function,
|
|
hwaccel_nvidia,
|
|
complete_sync,
|
|
&sync_point,
|
|
NULL);
|
|
|
|
grd_sync_point_wait_for_completion (&sync_point);
|
|
grd_sync_point_clear (&sync_point);
|
|
}
|
|
|
|
static gboolean
|
|
load_cuda_module (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUmodule *module,
|
|
const char *name,
|
|
const char *ptx_instructions)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result = cuda_funcs->cuModuleLoadData (module, ptx_instructions);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to load %s module: %s",
|
|
name, get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
load_cuda_function (GrdHwAccelNvidia *hwaccel_nvidia,
|
|
CUfunction *function,
|
|
CUmodule module,
|
|
const char *name)
|
|
{
|
|
CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
CUresult cu_result;
|
|
|
|
cu_result = cuda_funcs->cuModuleGetFunction (function, module, name);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to get kernel %s: %s",
|
|
name, get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
GrdHwAccelNvidia *
|
|
grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
|
|
{
|
|
g_autoptr (GrdHwAccelNvidia) hwaccel_nvidia = NULL;
|
|
gboolean cuda_device_found = FALSE;
|
|
CUdevice cu_devices[MAX_CUDA_DEVICES_FOR_RETRIEVAL] = {};
|
|
CUdevice cu_device = 0;
|
|
unsigned int cu_device_count = 0;
|
|
CudaFunctions *cuda_funcs;
|
|
NvencFunctions *nvenc_funcs;
|
|
g_autofree char *dmg_ptx_path = NULL;
|
|
g_autofree char *dmg_ptx_instructions = NULL;
|
|
g_autofree char *avc_ptx_path = NULL;
|
|
g_autofree char *avc_ptx_instructions = NULL;
|
|
g_autoptr (GError) error = NULL;
|
|
CUresult cu_result;
|
|
unsigned int i;
|
|
|
|
hwaccel_nvidia = g_object_new (GRD_TYPE_HWACCEL_NVIDIA, NULL);
|
|
hwaccel_nvidia->egl_thread = egl_thread;
|
|
|
|
cuda_load_functions (&hwaccel_nvidia->cuda_funcs, NULL);
|
|
nvenc_load_functions (&hwaccel_nvidia->nvenc_funcs, NULL);
|
|
|
|
if (!hwaccel_nvidia->cuda_funcs || !hwaccel_nvidia->nvenc_funcs)
|
|
{
|
|
g_debug ("[HWAccel.CUDA] Failed to load CUDA or NVENC library");
|
|
return NULL;
|
|
}
|
|
if (!load_extra_cuda_functions (hwaccel_nvidia))
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to load extra CUDA functions");
|
|
return NULL;
|
|
}
|
|
|
|
cuda_funcs = hwaccel_nvidia->cuda_funcs;
|
|
nvenc_funcs = hwaccel_nvidia->nvenc_funcs;
|
|
|
|
cu_result = cuda_funcs->cuInit (0);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_debug ("[HWAccel.CUDA] Failed to initialize CUDA: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return NULL;
|
|
}
|
|
if (!get_cuda_devices_from_gl_context (hwaccel_nvidia, egl_thread,
|
|
&cu_device_count, cu_devices))
|
|
{
|
|
g_message ("[HWAccel.CUDA] Unable to retrieve CUDA devices");
|
|
return NULL;
|
|
}
|
|
|
|
g_debug ("[HWAccel.CUDA] Retrieved %u CUDA device(s)", cu_device_count);
|
|
for (i = 0; i < cu_device_count; ++i)
|
|
{
|
|
int cc_major = 0, cc_minor = 0;
|
|
|
|
cu_device = cu_devices[i];
|
|
cu_result =
|
|
cuda_funcs->cuDeviceGetAttribute (&cc_major,
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
|
|
cu_device);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to get device attribute: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
continue;
|
|
}
|
|
cu_result =
|
|
cuda_funcs->cuDeviceGetAttribute (&cc_minor,
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
|
|
cu_device);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to get device attribute: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
continue;
|
|
}
|
|
|
|
g_debug ("[HWAccel.CUDA] Device %u compute capability: [%i, %i]",
|
|
i, cc_major, cc_minor);
|
|
if (cc_major >= 3)
|
|
{
|
|
g_debug ("[HWAccel.CUDA] Choosing CUDA device with id %u", i);
|
|
cuda_device_found = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!cu_device_count || !cuda_device_found)
|
|
{
|
|
g_debug ("[HWAccel.CUDA] No appropriate CUDA capable gpu found");
|
|
return NULL;
|
|
}
|
|
|
|
hwaccel_nvidia->cu_device = cu_device;
|
|
cu_result = cuda_funcs->cuDevicePrimaryCtxRetain (&hwaccel_nvidia->cu_context,
|
|
hwaccel_nvidia->cu_device);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to retain CUDA context: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
return NULL;
|
|
}
|
|
|
|
hwaccel_nvidia->nvenc_api.version = NV_ENCODE_API_FUNCTION_LIST_VER;
|
|
if (nvenc_funcs->NvEncodeAPICreateInstance (&hwaccel_nvidia->nvenc_api) != NV_ENC_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.NVENC] Could not create NVENC API instance");
|
|
cuda_funcs->cuDevicePrimaryCtxRelease (hwaccel_nvidia->cu_device);
|
|
return NULL;
|
|
}
|
|
|
|
cu_result = cuda_funcs->cuCtxPushCurrent (hwaccel_nvidia->cu_context);
|
|
if (cu_result != CUDA_SUCCESS)
|
|
{
|
|
g_warning ("[HWAccel.CUDA] Failed to push CUDA context: %s",
|
|
get_cuda_error_string (hwaccel_nvidia, cu_result));
|
|
cuda_funcs->cuDevicePrimaryCtxRelease (hwaccel_nvidia->cu_device);
|
|
return NULL;
|
|
}
|
|
|
|
run_function_in_egl_thread (hwaccel_nvidia, push_cuda_context_in_egl_thread);
|
|
|
|
hwaccel_nvidia->initialized = TRUE;
|
|
|
|
dmg_ptx_path = g_strdup_printf ("%s/grd-cuda-damage-utils_30.ptx", GRD_DATA_DIR);
|
|
avc_ptx_path = g_strdup_printf ("%s/grd-cuda-avc-utils_30.ptx", GRD_DATA_DIR);
|
|
|
|
if (!g_file_get_contents (dmg_ptx_path, &dmg_ptx_instructions, NULL, &error) ||
|
|
!g_file_get_contents (avc_ptx_path, &avc_ptx_instructions, NULL, &error))
|
|
g_error ("[HWAccel.CUDA] Failed to read PTX instructions: %s", error->message);
|
|
|
|
if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_dmg_utils,
|
|
"damage utils", dmg_ptx_instructions))
|
|
return NULL;
|
|
|
|
if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_chk_dmg_pxl,
|
|
hwaccel_nvidia->cu_module_dmg_utils, "check_damaged_pixel") ||
|
|
!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_cols,
|
|
hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_cols") ||
|
|
!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_rows,
|
|
hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_rows") ||
|
|
!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_simplify_dmg_arr,
|
|
hwaccel_nvidia->cu_module_dmg_utils, "simplify_damage_array"))
|
|
return NULL;
|
|
|
|
if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_avc_utils,
|
|
"AVC utils", avc_ptx_instructions))
|
|
return NULL;
|
|
|
|
if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_bgrx_to_yuv420,
|
|
hwaccel_nvidia->cu_module_avc_utils, "convert_2x2_bgrx_area_to_yuv420_nv12"))
|
|
return NULL;
|
|
|
|
return g_steal_pointer (&hwaccel_nvidia);
|
|
}
|
|
|
|
static void
|
|
grd_hwaccel_nvidia_dispose (GObject *object)
|
|
{
|
|
GrdHwAccelNvidia *hwaccel_nvidia = GRD_HWACCEL_NVIDIA (object);
|
|
|
|
g_clear_pointer (&hwaccel_nvidia->cu_module_avc_utils,
|
|
hwaccel_nvidia->cuda_funcs->cuModuleUnload);
|
|
g_clear_pointer (&hwaccel_nvidia->cu_module_dmg_utils,
|
|
hwaccel_nvidia->cuda_funcs->cuModuleUnload);
|
|
|
|
if (hwaccel_nvidia->initialized)
|
|
{
|
|
run_function_in_egl_thread (hwaccel_nvidia, pop_cuda_context_in_egl_thread);
|
|
|
|
hwaccel_nvidia->cuda_funcs->cuCtxPopCurrent (&hwaccel_nvidia->cu_context);
|
|
hwaccel_nvidia->cuda_funcs->cuDevicePrimaryCtxRelease (hwaccel_nvidia->cu_device);
|
|
|
|
hwaccel_nvidia->initialized = FALSE;
|
|
}
|
|
|
|
g_clear_pointer (&hwaccel_nvidia->cuda_lib, dlclose);
|
|
g_clear_pointer (&hwaccel_nvidia->extra_cuda_funcs, g_free);
|
|
|
|
nvenc_free_functions (&hwaccel_nvidia->nvenc_funcs);
|
|
cuda_free_functions (&hwaccel_nvidia->cuda_funcs);
|
|
|
|
g_assert (!hwaccel_nvidia->encode_sessions ||
|
|
g_hash_table_size (hwaccel_nvidia->encode_sessions) == 0);
|
|
g_clear_pointer (&hwaccel_nvidia->encode_sessions, g_hash_table_destroy);
|
|
|
|
G_OBJECT_CLASS (grd_hwaccel_nvidia_parent_class)->dispose (object);
|
|
}
|
|
|
|
static void
|
|
grd_hwaccel_nvidia_init (GrdHwAccelNvidia *hwaccel_nvidia)
|
|
{
|
|
hwaccel_nvidia->encode_sessions = g_hash_table_new (NULL, NULL);
|
|
}
|
|
|
|
static void
|
|
grd_hwaccel_nvidia_class_init (GrdHwAccelNvidiaClass *klass)
|
|
{
|
|
GObjectClass *object_class = G_OBJECT_CLASS (klass);
|
|
|
|
object_class->dispose = grd_hwaccel_nvidia_dispose;
|
|
}
|