diff --git a/CMakeLists.txt b/CMakeLists.txt index e9498d08a..6c20267f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -754,6 +754,10 @@ set(OPENH264_FEATURE_TYPE "OPTIONAL") set(OPENH264_FEATURE_PURPOSE "codec") set(OPENH264_FEATURE_DESCRIPTION "use OpenH264 library") +set(OPENCL_FEATURE_TYPE "OPTIONAL") +set(OPENCL_FEATURE_PURPOSE "codec") +set(OPENCL_FEATURE_DESCRIPTION "use OpenCL library") + set(GSM_FEATURE_TYPE "OPTIONAL") set(GSM_FEATURE_PURPOSE "codec") set(GSM_FEATURE_DESCRIPTION "GSM audio codec library") @@ -866,6 +870,7 @@ find_feature(GStreamer_1_0 ${GSTREAMER_1_0_FEATURE_TYPE} ${GSTREAMER_1_0_FEATURE find_feature(JPEG ${JPEG_FEATURE_TYPE} ${JPEG_FEATURE_PURPOSE} ${JPEG_FEATURE_DESCRIPTION}) find_feature(x264 ${X264_FEATURE_TYPE} ${X264_FEATURE_PURPOSE} ${X264_FEATURE_DESCRIPTION}) find_feature(OpenH264 ${OPENH264_FEATURE_TYPE} ${OPENH264_FEATURE_PURPOSE} ${OPENH264_FEATURE_DESCRIPTION}) +find_feature(OpenCL ${OPENCL_FEATURE_TYPE} ${OPENCL_FEATURE_PURPOSE} ${OPENCL_FEATURE_DESCRIPTION}) find_feature(GSM ${GSM_FEATURE_TYPE} ${GSM_FEATURE_PURPOSE} ${GSM_FEATURE_DESCRIPTION}) find_feature(LAME ${LAME_FEATURE_TYPE} ${LAME_FEATURE_PURPOSE} ${LAME_FEATURE_DESCRIPTION}) find_feature(FAAD2 ${FAAD2_FEATURE_TYPE} ${FAAD2_FEATURE_PURPOSE} ${FAAD2_FEATURE_DESCRIPTION}) diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake new file mode 100644 index 000000000..388d09855 --- /dev/null +++ b/cmake/FindOpenCL.cmake @@ -0,0 +1,41 @@ +# - Try to find the OpenCL library +# Once done this will define +# +# OPENCL_ROOT - A list of search hints +# +# OPENCL_FOUND - system has OpenCL +# OPENCL_INCLUDE_DIR - the OpenCL include directory +# OPENCL_LIBRARIES - opencl library + +if (OPENCL_INCLUDE_DIR AND OPENCL_LIBRARY) + set(OPENCL_FIND_QUIETLY TRUE) +endif() + +find_path(OPENCL_INCLUDE_DIR NAMES OpenCL/opencl.h CL/cl.h + PATH_SUFFIXES include + HINTS ${OPENCL_ROOT}) +find_library(OPENCL_LIBRARY + NAMES OpenCL + PATH_SUFFIXES lib + HINTS ${OPENCL_ROOT}) + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OpenCL DEFAULT_MSG OPENCL_LIBRARY OPENCL_INCLUDE_DIR) + +if (OPENCL_INCLUDE_DIR AND OPENCL_LIBRARY) + set(OPENCL_FOUND TRUE) + set(OPENCL_LIBRARIES ${OPENCL_LIBRARY}) +endif() + +if (OPENCL_FOUND) + if (NOT OPENCL_FIND_QUIETLY) + message(STATUS "Found OpenCL: ${OPENCL_LIBRARIES}") + endif() +else() + if (OPENCL_FIND_REQUIRED) + message(FATAL_ERROR "OpenCL was not found") + endif() +endif() + +mark_as_advanced(OPENCL_INCLUDE_DIR OPENCL_LIBRARY) + diff --git a/config.h.in b/config.h.in index 2aa3427d9..1cb397638 100644 --- a/config.h.in +++ b/config.h.in @@ -66,6 +66,7 @@ #cmakedefine WITH_DSP_EXPERIMENTAL #cmakedefine WITH_DSP_FFMPEG #cmakedefine WITH_X264 +#cmakedefine WITH_OPENCL #cmakedefine WITH_MEDIA_FOUNDATION #cmakedefine WITH_VAAPI @@ -169,4 +170,5 @@ #cmakedefine WITH_DEBUG_X11_LOCAL_MOVESIZE #cmakedefine WITH_DEBUG_XV #cmakedefine WITH_DEBUG_RINGBUFFER + #endif /* FREERDP_CONFIG_H */ diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h index f07742f21..759b438b2 100644 --- a/include/freerdp/primitives.h +++ b/include/freerdp/primitives.h @@ -56,6 +56,13 @@ typedef INT32 pstatus_t; /* match IppStatus. */ #define PRIM_ARM_IWMMXT_AVAILABLE (1U << 6) #define PRIM_ARM_NEON_AVAILABLE (1U << 7) +/** @brief flags of primitives */ +enum +{ + PRIM_FLAGS_HAVE_EXTCPU = (1U << 0), /* primitives are using CPU extensions */ + PRIM_FLAGS_HAVE_EXTGPU = (1U << 1), /* primitives are using the GPU */ +}; + /* Structures compatible with IPP */ typedef struct { @@ -135,6 +142,8 @@ typedef pstatus_t (*__RGBToAVC444YUV_t)(const BYTE* pSrc, UINT32 srcFormat, UINT const prim_size_t* roi); typedef pstatus_t (*__andC_32u_t)(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len); typedef pstatus_t (*__orC_32u_t)(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len); +typedef pstatus_t (*primitives_uninit_t)(void); + typedef struct { @@ -177,15 +186,31 @@ typedef struct __YUV444ToRGB_8u_P3AC4R_t YUV444ToRGB_8u_P3AC4R; __RGBToAVC444YUV_t RGBToAVC444YUV; __RGBToAVC444YUV_t RGBToAVC444YUVv2; + /* flags */ + DWORD flags; + primitives_uninit_t uninit; } primitives_t; +typedef enum +{ + PRIMITIVES_PURE_SOFT, /** use generic software implementation */ + PRIMITIVES_ONLY_CPU, /** use generic software or cpu optimized routines */ + PRIMITIVES_AUTODETECT /** detect the best routines */ +} primitive_hints; + #ifdef __cplusplus extern "C" { #endif FREERDP_API primitives_t* primitives_get(void); + FREERDP_API void primitives_set_hints(primitive_hints hints); + FREERDP_API primitive_hints primitives_get_hints(void); FREERDP_API primitives_t* primitives_get_generic(void); + FREERDP_API DWORD primitives_flags(primitives_t *p); + FREERDP_API BOOL primitives_init(primitives_t *p, primitive_hints hints); + FREERDP_API void primitives_uninit(); + #ifdef __cplusplus } diff --git a/libfreerdp/CMakeLists.txt b/libfreerdp/CMakeLists.txt index 534991db4..59bdb1d4f 100644 --- a/libfreerdp/CMakeLists.txt +++ b/libfreerdp/CMakeLists.txt @@ -292,10 +292,21 @@ if (WITH_NEON) primitives/prim_YUV_neon.c) endif() +if (WITH_OPENCL) + freerdp_definition_add(-DOPENCL_SOURCE_PATH="${CMAKE_INSTALL_PREFIX}/${FREERDP_PLUGIN_PATH}") + set(PRIMITIVES_OPENCL_SRCS primitives/prim_YUV_opencl.c) + + freerdp_include_directory_add(${OPENCL_INCLUDE_DIRS}) + freerdp_library_add(${OPENCL_LIBRARIES}) + + install(FILES primitives/primitives.cl DESTINATION ${FREERDP_PLUGIN_PATH}) +endif() + set(PRIMITIVES_OPT_SRCS ${PRIMITIVES_SSE2_SRCS} ${PRIMITIVES_SSE3_SRCS} - ${PRIMITIVES_SSSE3_SRCS}) + ${PRIMITIVES_SSSE3_SRCS} + ${PRIMITIVES_OPENCL_SRCS}) freerdp_definition_add(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}) diff --git a/libfreerdp/codec/yuv.c b/libfreerdp/codec/yuv.c index a97091e4f..951bea7dd 100644 --- a/libfreerdp/codec/yuv.c +++ b/libfreerdp/codec/yuv.c @@ -108,10 +108,10 @@ BOOL yuv_context_decode(YUV_CONTEXT* context, const BYTE* pYUVData[3], UINT32 iS YUV_PROCESS_WORK_PARAM* params; UINT32 waitCount = 0; BOOL ret = TRUE; + primitives_t* prims = primitives_get(); - if (!context->useThreads) + if (!context->useThreads || (primitives_flags(prims) & PRIM_FLAGS_HAVE_EXTGPU)) { - primitives_t* prims = primitives_get(); prim_size_t roi; roi.width = context->width; roi.height = context->height; diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c index 333677f06..d6e0d3eeb 100644 --- a/libfreerdp/primitives/prim_YUV.c +++ b/libfreerdp/primitives/prim_YUV.c @@ -460,9 +460,9 @@ static pstatus_t general_YUV444ToRGB_8u_P3AC4R(const BYTE* const pSrc[3], const } } /** - * | R | ( | 256 0 403 | | Y | ) - * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 - * | B | ( | 256 475 0 | | V - 128 | ) + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) */ static pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* const pSrc[3], const UINT32 srcStep[3], BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, diff --git a/libfreerdp/primitives/prim_YUV_opencl.c b/libfreerdp/primitives/prim_YUV_opencl.c new file mode 100644 index 000000000..a7a013a18 --- /dev/null +++ b/libfreerdp/primitives/prim_YUV_opencl.c @@ -0,0 +1,180 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized YUV/RGB conversion operations using openCL + * + * Copyright 2019 David Fort + * Copyright 2019 Rangee Gmbh + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include "prim_internal.h" + + +#define TAG FREERDP_TAG("primitives") + + + +static pstatus_t opencl_YUV420ToRGB(const char *kernelName, const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, const prim_size_t* roi) +{ + cl_int ret; + int i; + cl_mem objs[3] = {NULL, NULL, NULL}; + cl_mem destObj; + cl_kernel kernel; + size_t indexes[2]; + const char *sourceNames[] = {"Y", "U", "V"}; + primitives_opencl_context *cl = primitives_get_opencl_context(); + + kernel = clCreateKernel(cl->program, kernelName, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create kernel %s", kernelName); + return -1; + } + + for (i = 0; i < 3; i++) + { + objs[i] = clCreateBuffer(cl->context, CL_MEM_READ_ONLY, srcStep[i] * roi->height, NULL, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to create %sobj", sourceNames[i]); + goto error_objs; + } + + ret = clEnqueueWriteBuffer(cl->commandQueue, objs[i], CL_TRUE, 0, srcStep[i] * roi->height, + pSrc[i], 0, NULL, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to enqueue write command for %sobj", sourceNames[i]); + goto error_objs; + } + } + + destObj = clCreateBuffer(cl->context, CL_MEM_WRITE_ONLY, dstStep * roi->height, NULL, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to create dest obj"); + goto error_objs; + } + + /* push source + stride arguments*/ + for (i = 0; i < 3; i++) + { + ret = clSetKernelArg(kernel, i * 2, sizeof(cl_mem), (void *)&objs[i]); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[i]); + goto error_set_args; + } + + ret = clSetKernelArg(kernel, i * 2 + 1, sizeof(cl_int), (void *)&srcStep[i]); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[i]); + goto error_set_args; + } + } + + ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&destObj); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg destObj"); + goto error_set_args; + } + + ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&dstStep); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to set arg dstStep"); + goto error_set_args; + } + + indexes[0] = roi->width; + indexes[1] = roi->height; + ret = clEnqueueNDRangeKernel(cl->commandQueue, kernel, 2, NULL, indexes, NULL, + 0, NULL, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to enqueue call kernel"); + goto error_set_args; + } + + /* Transfer result to host */ + ret = clEnqueueReadBuffer(cl->commandQueue, destObj, CL_TRUE, 0, roi->height * dstStep, pDst, 0, NULL, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "unable to read back buffer"); + goto error_set_args; + } + + /* cleanup things */ + clReleaseMemObject(destObj); + for (i = 0; i < 3; i++) + if (objs[i]) + clReleaseMemObject(objs[i]); + clReleaseKernel(kernel); + + return PRIMITIVES_SUCCESS; + +error_set_args: + clReleaseMemObject(destObj); +error_objs: + for (i = 0; i < 3; i++) + { + if (objs[i]) + clReleaseMemObject(objs[i]); + } + clReleaseKernel(kernel); + return -1; +} + + +static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, const prim_size_t* roi) +{ + const char *kernel_name; + + switch(DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + kernel_name = "yuv420_to_bgra_1b"; + break; + case PIXEL_FORMAT_XRGB32: + case PIXEL_FORMAT_ARGB32: + kernel_name = "yuv420_to_argb_1b"; + break; + default: { + primitives_opencl_context *cl = primitives_get_opencl_context(); + return cl->YUV420ToRGB_backup(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } + } + + return opencl_YUV420ToRGB(kernel_name, pSrc, srcStep, pDst, dstStep, roi); +} + +void primitives_init_YUV_opencl(primitives_t* prims) +{ + prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R; + +} + + diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/prim_YUV_ssse3.c index 8dc39dcf5..beda11dc6 100644 --- a/libfreerdp/primitives/prim_YUV_ssse3.c +++ b/libfreerdp/primitives/prim_YUV_ssse3.c @@ -1448,6 +1448,7 @@ static pstatus_t ssse3_YUV420CombineToYUV444(avc444_frame_type type, const BYTE* } } + void primitives_init_YUV_opt(primitives_t* prims) { generic = primitives_get_generic(); diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h index ce1e8d197..fb04e517d 100644 --- a/libfreerdp/primitives/prim_internal.h +++ b/libfreerdp/primitives/prim_internal.h @@ -32,10 +32,15 @@ #endif #endif -#if defined(WITH_SSE2) || defined(WITH_NEON) +#if defined(WITH_SSE2) || defined(WITH_NEON) || defined(WITH_OPENCL) #define HAVE_OPTIMIZED_PRIMITIVES 1 #endif +#if defined(WITH_SSE2) || defined(WITH_NEON) +#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1 +#endif + + #if defined(WITH_SSE2) /* Use lddqu for unaligned; load for 16-byte aligned. */ #define LOAD_SI128(_ptr_) \ @@ -203,4 +208,30 @@ FREERDP_LOCAL void primitives_init_YCoCg_opt(primitives_t* prims); FREERDP_LOCAL void primitives_init_YUV_opt(primitives_t* prims); #endif +#if defined(WITH_OPENCL) +#ifdef __APPLE__ +#include "OpenCL/opencl.h" +#else +#include +#endif + +typedef struct +{ + BOOL support; + cl_platform_id platformId; + cl_device_id deviceId; + cl_context context; + cl_command_queue commandQueue; + cl_program program; + __YUV420ToRGB_8u_P3AC4R_t YUV420ToRGB_backup; +} primitives_opencl_context; + + +FREERDP_LOCAL BOOL primitives_init_opencl(primitives_t* prims); +FREERDP_LOCAL pstatus_t primitives_uninit_opencl(void); +FREERDP_LOCAL primitives_opencl_context *primitives_get_opencl_context(void); + +FREERDP_LOCAL void primitives_init_YUV_opencl(primitives_t* prims); +#endif + #endif /* FREERDP_LIB_PRIM_INTERNAL_H */ diff --git a/libfreerdp/primitives/primitives.c b/libfreerdp/primitives/primitives.c index 1bc959a1f..b55232d19 100644 --- a/libfreerdp/primitives/primitives.c +++ b/libfreerdp/primitives/primitives.c @@ -4,6 +4,7 @@ * * Copyright 2011 Martin Fleisz * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. + * Copyright 2019 David Fort * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain @@ -23,64 +24,482 @@ #include #include +#include +#include #include #include "prim_internal.h" +#define TAG FREERDP_TAG("primitives") + +/* hints to know which kind of primitives to use */ +static primitive_hints primitivesHints = PRIMITIVES_AUTODETECT; + +void primitives_set_hints(primitive_hints hints) +{ + primitivesHints = hints; +} + +primitive_hints primitives_get_hints(void) +{ + return primitivesHints; +} + + /* Singleton pointer used throughout the program when requested. */ static primitives_t pPrimitivesGeneric = { 0 }; static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT; -#if defined(HAVE_OPTIMIZED_PRIMITIVES) + static primitives_t pPrimitives = { 0 }; static INIT_ONCE primitives_InitOnce = INIT_ONCE_STATIC_INIT; -#endif /* ------------------------------------------------------------------------- */ -static BOOL CALLBACK primitives_init_generic(PINIT_ONCE once, PVOID param, PVOID* context) +static BOOL primitives_init_generic(primitives_t *prims) { - WINPR_UNUSED(once); - WINPR_UNUSED(param); - WINPR_UNUSED(context); - primitives_init_add(&pPrimitivesGeneric); - primitives_init_andor(&pPrimitivesGeneric); - primitives_init_alphaComp(&pPrimitivesGeneric); - primitives_init_copy(&pPrimitivesGeneric); - primitives_init_set(&pPrimitivesGeneric); - primitives_init_shift(&pPrimitivesGeneric); - primitives_init_sign(&pPrimitivesGeneric); - primitives_init_colors(&pPrimitivesGeneric); - primitives_init_YCoCg(&pPrimitivesGeneric); - primitives_init_YUV(&pPrimitivesGeneric); + primitives_init_add(prims); + primitives_init_andor(prims); + primitives_init_alphaComp(prims); + primitives_init_copy(prims); + primitives_init_set(prims); + primitives_init_shift(prims); + primitives_init_sign(prims); + primitives_init_colors(prims); + primitives_init_YCoCg(prims); + primitives_init_YUV(prims); + prims->uninit = NULL; return TRUE; } -#if defined(HAVE_OPTIMIZED_PRIMITIVES) -static BOOL CALLBACK primitives_init(PINIT_ONCE once, PVOID param, PVOID* context) +static BOOL CALLBACK primitives_init_generic_cb(PINIT_ONCE once, PVOID param, PVOID* context) { WINPR_UNUSED(once); WINPR_UNUSED(param); WINPR_UNUSED(context); - /* Now call each section's initialization routine. */ - primitives_init_add_opt(&pPrimitives); - primitives_init_andor_opt(&pPrimitives); - primitives_init_alphaComp_opt(&pPrimitives); - primitives_init_copy_opt(&pPrimitives); - primitives_init_set_opt(&pPrimitives); - primitives_init_shift_opt(&pPrimitives); - primitives_init_sign_opt(&pPrimitives); - primitives_init_colors_opt(&pPrimitives); - primitives_init_YCoCg_opt(&pPrimitives); - primitives_init_YUV_opt(&pPrimitives); + return primitives_init_generic(&pPrimitivesGeneric); +} + +static BOOL primitives_init_optimized(primitives_t *prims) +{ + primitives_init_generic(prims); + +#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES) + primitives_init_add_opt(prims); + primitives_init_andor_opt(prims); + primitives_init_alphaComp_opt(prims); + primitives_init_copy_opt(prims); + primitives_init_set_opt(prims); + primitives_init_shift_opt(prims); + primitives_init_sign_opt(prims); + primitives_init_colors_opt(prims); + primitives_init_YCoCg_opt(prims); + primitives_init_YUV_opt(prims); + prims->flags |= PRIM_FLAGS_HAVE_EXTCPU; +#endif return TRUE; } + +typedef struct { + BYTE *channels[3]; + UINT32 steps[3]; + prim_size_t roi; + BYTE *outputBuffer; + UINT32 outputStride; + UINT32 testedFormat; +} primitives_YUV_benchmark; + +static primitives_YUV_benchmark* primitives_YUV_benchmark_init(void) +{ + int i; + primitives_YUV_benchmark *ret = calloc(1, sizeof(*ret)); + prim_size_t *roi; + if (!ret) + return NULL; + + roi = &ret->roi; + roi->width = 1024; + roi->height = 768; + + ret->outputStride = roi->width *4; + ret->testedFormat = PIXEL_FORMAT_BGRA32; + + ret->outputBuffer = malloc(roi->width * roi->height * 4); + if (!ret->outputBuffer) + goto error_output; + + for (i = 0; i < 3; i++) + { + BYTE *buf = ret->channels[i] = malloc(roi->width * roi->height); + if (!buf) + goto error_channels; + + winpr_RAND(buf, roi->width * roi->height); + ret->steps[i] = roi->width; + } + + return ret; + +error_channels: + for(i = 0; i < 3; i++) + free(ret->channels[i]); +error_output: + free(ret); + return NULL; +} + +static void primitives_YUV_benchmark_free(primitives_YUV_benchmark **pbench) +{ + int i; + primitives_YUV_benchmark *bench; + if (!*pbench) + return; + bench = *pbench; + + free(bench->outputBuffer); + + for (i = 0; i < 3; i++) + free(bench->channels[i]); + + free(bench); + *pbench = NULL; +} + +static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark *bench, primitives_t *prims, + UINT64 runTime, UINT32 *computations) +{ + ULONGLONG dueDate = GetTickCount64() + runTime; + const BYTE *channels[3]; + int i; + + *computations = 0; + + for (i = 0; i < 3; i++) + channels[i] = bench->channels[i]; + + while (GetTickCount64() < dueDate) + { + pstatus_t status = prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer, + bench->outputStride, bench->testedFormat, &bench->roi); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + *computations = *computations + 1; + } + return TRUE; +} + +static BOOL primitives_autodetect_best(primitives_t *prims) +{ + BOOL ret = FALSE; + UINT64 benchDuration = 150; // 100 ms + UINT32 genericCount = 0, optimizedCount = 0, openclCount = 0; + UINT32 bestCount; + primitives_t *genericPrims = primitives_get_generic(); + primitives_t optimizedPrims; +#if defined(WITH_OPENCL) + primitives_t openclPrims; #endif + const char *primName = "generic"; + + primitives_YUV_benchmark *yuvBench = primitives_YUV_benchmark_init(); + if (!yuvBench) + return FALSE; + + if (!primitives_YUV_benchmark_run(yuvBench, genericPrims, benchDuration, &genericCount)) + { + WLog_ERR(TAG, "error running generic YUV bench"); + goto out; + } + + if (!primitives_init_optimized(&optimizedPrims)) + { + WLog_ERR(TAG, "error initializing CPU optimized primitives"); + goto out; + } + + if(optimizedPrims.flags & PRIM_FLAGS_HAVE_EXTCPU) /* run the test only if we really have optimizations */ + { + if (!primitives_YUV_benchmark_run(yuvBench, &optimizedPrims, benchDuration, &optimizedCount)) + { + WLog_ERR(TAG, "error running optimized YUV bench"); + goto out; + } + } + +#if defined(WITH_OPENCL) + if (primitives_init_opencl(&openclPrims)) + { + if (!primitives_YUV_benchmark_run(yuvBench, &openclPrims, benchDuration, &openclCount)) + { + WLog_ERR(TAG, "error running opencl YUV bench"); + goto out; + } + } +#endif + + /* finally compute the results */ + bestCount = genericCount; + *prims = *genericPrims; + + if (bestCount < optimizedCount) + { + bestCount = optimizedCount; + *prims = optimizedPrims; + primName = "optimized"; + } + +#if defined(WITH_OPENCL) + if (bestCount < openclCount) + { + bestCount = openclCount; + *prims = openclPrims; + primName = "openCL"; + } +#endif + + WLog_DBG(TAG, "benchmark result: generic=%d optimized=%d openCL=%d", genericCount, optimizedCount, openclCount); + WLog_INFO(TAG, "primitives autodetect, using %s", primName); + ret = TRUE; +out: + primitives_YUV_benchmark_free(&yuvBench); + return ret; +} + +static BOOL CALLBACK primitives_init_cb(PINIT_ONCE once, PVOID param, PVOID* context) +{ + WINPR_UNUSED(once); + WINPR_UNUSED(param); + WINPR_UNUSED(context); + + return primitives_init(&pPrimitives, primitivesHints); +} + + +#if defined(WITH_OPENCL) +static primitives_opencl_context openclContext; + +primitives_opencl_context *primitives_get_opencl_context(void) +{ + return &openclContext; +} + +pstatus_t primitives_uninit_opencl(void) +{ + if (!openclContext.support) + return PRIMITIVES_SUCCESS; + + clReleaseProgram(openclContext.program); + clReleaseCommandQueue(openclContext.commandQueue); + clReleaseContext(openclContext.context); + clReleaseDevice(openclContext.deviceId); + openclContext.support = FALSE; + return PRIMITIVES_SUCCESS; +} + +BOOL primitives_init_opencl_context(primitives_opencl_context *cl) +{ + cl_platform_id *platform_ids = NULL; + cl_uint ndevices, nplatforms, i; + cl_kernel kernel; + cl_int ret; + char sourcePath[1000]; + primitives_t optimized; + + BOOL gotGPU = FALSE; + FILE *f; + size_t programLen; + char *programSource; + + if (!primitives_init_optimized(&optimized)) + return FALSE; + cl->YUV420ToRGB_backup = optimized.YUV420ToRGB_8u_P3AC4R; + + ret = clGetPlatformIDs(0, NULL, &nplatforms); + if (ret != CL_SUCCESS || nplatforms < 1) + return FALSE; + + platform_ids = calloc(nplatforms, sizeof(*platform_ids)); + if (!platform_ids) + return FALSE; + + ret = clGetPlatformIDs(nplatforms, platform_ids, &nplatforms); + if (ret != CL_SUCCESS) + { + free(platform_ids); + return FALSE; + } + + for (i = 0; (i < nplatforms) && !gotGPU; i++) + { + cl_device_id device_id; + cl_context context; + char platformName[1000]; + char deviceName[1000]; + + ret = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sizeof(platformName), platformName, NULL); + if (ret != CL_SUCCESS) + continue; + + ret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_GPU, 1, &device_id, &ndevices); + if (ret != CL_SUCCESS) + continue; + + ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable get device name for platform %s", platformName); + clReleaseDevice(device_id); + continue; + } + + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create context for platform %s, device %s", platformName, deviceName); + clReleaseDevice(device_id); + continue; + } + + cl->commandQueue = clCreateCommandQueue(context, device_id, 0, &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create command queue"); + clReleaseContext(context); + clReleaseDevice(device_id); + continue; + } + + WLog_INFO(TAG, "openCL: using platform=%s device=%s", platformName, deviceName); + + cl->platformId = platform_ids[i]; + cl->deviceId = device_id; + cl->context = context; + gotGPU = TRUE; + } + + free(platform_ids); + + if (!gotGPU) + { + WLog_ERR(TAG, "openCL: no GPU found"); + return FALSE; + } + + snprintf(sourcePath, sizeof(sourcePath), "%s/primitives.cl", OPENCL_SOURCE_PATH); + + f = fopen(sourcePath, "r"); + if (!f) + { + WLog_ERR(TAG, "openCL: unable to open source file %s", sourcePath); + goto error_source_file; + } + + fseek(f, 0, SEEK_END); + programLen = ftell(f); + fseek(f, 0, SEEK_SET); + + programSource = malloc(programLen); + if (!programSource) { + WLog_ERR(TAG, "openCL: unable to allocate memory(%d bytes) for source file %s", + programLen, sourcePath); + fclose(f); + goto error_source_file; + } + + if (fread(programSource, programLen, 1, f) <= 0) + { + WLog_ERR(TAG, "openCL: unable to read openCL program in %s", sourcePath); + free(programSource); + fclose(f); + goto error_source_file; + } + fclose(f); + + cl->program = clCreateProgramWithSource(cl->context, 1, (const char **)&programSource, + &programLen, &ret); + if (ret != CL_SUCCESS) { + WLog_ERR(TAG, "openCL: unable to create command queue"); + goto out_program_create; + } + free(programSource); + + ret = clBuildProgram(cl->program, 1, &cl->deviceId, NULL, NULL, NULL); + if (ret != CL_SUCCESS) + { + size_t length; + char buffer[2048]; + ret = clGetProgramBuildInfo(cl->program, cl->deviceId, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: building program failed but unable to retrieve buildLog, error=%d", ret); + } + else + { + WLog_ERR(TAG, "openCL: unable to build program, errorLog=%s", buffer); + } + goto out_program_build; + } + + kernel = clCreateKernel(cl->program, "yuv420_to_bgra_1b", &ret); + if (ret != CL_SUCCESS) + { + WLog_ERR(TAG, "openCL: unable to create yuv420_to_bgra_1b kernel"); + goto out_program_build; + } + clReleaseKernel(kernel); + + cl->support = TRUE; + return TRUE; + +out_program_build: + clReleaseProgram(cl->program); +error_source_file: +out_program_create: + clReleaseCommandQueue(cl->commandQueue); + clReleaseContext(cl->context); + clReleaseDevice(cl->deviceId); + return FALSE; +} + +BOOL primitives_init_opencl(primitives_t* prims) +{ + if (!primitives_init_opencl_context(&openclContext)) + return FALSE; + + primitives_init_optimized(prims); + primitives_init_YUV_opencl(prims); + prims->flags |= PRIM_FLAGS_HAVE_EXTGPU; + prims->uninit = primitives_uninit_opencl; + return TRUE; +} + +#endif + +BOOL primitives_init(primitives_t *p, primitive_hints hints) +{ + switch(hints) + { + case PRIMITIVES_PURE_SOFT: + return primitives_init_generic(p); + case PRIMITIVES_ONLY_CPU: + return primitives_init_optimized(p); + case PRIMITIVES_AUTODETECT: + return primitives_autodetect_best(p); + default: + WLog_ERR(TAG, "unknown hint %d", hints); + return FALSE; + } +} + +void primitives_uninit() { + if (pPrimitives.uninit) + pPrimitives.uninit(); +} /* ------------------------------------------------------------------------- */ primitives_t* primitives_get(void) { - InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic, NULL, NULL); + InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL); #if defined(HAVE_OPTIMIZED_PRIMITIVES) - InitOnceExecuteOnce(&primitives_InitOnce, primitives_init, NULL, NULL); + InitOnceExecuteOnce(&primitives_InitOnce, primitives_init_cb, NULL, NULL); return &pPrimitives; #else return &pPrimitivesGeneric; @@ -89,6 +508,11 @@ primitives_t* primitives_get(void) primitives_t* primitives_get_generic(void) { - InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic, NULL, NULL); + InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL); return &pPrimitivesGeneric; } + +DWORD primitives_flags(primitives_t *p) +{ + return p->flags; +} diff --git a/libfreerdp/primitives/primitives.cl b/libfreerdp/primitives/primitives.cl new file mode 100644 index 000000000..151bc370d --- /dev/null +++ b/libfreerdp/primitives/primitives.cl @@ -0,0 +1,82 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized operations using openCL + * + * Copyright 2019 David Fort + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +unsigned char clamp_uc(int v, int l, int h) +{ + if (v > h) + v = h; + if (v < l) + v = l; + return (unsigned char)v; +} + +__kernel void yuv420_to_argb_1b( + __global unsigned char *bufY, int strideY, + __global unsigned char *bufU, int strideU, + __global unsigned char *bufV, int strideV, + __global unsigned char *dest, int strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + int Y = bufY[y * strideY + x]; + int Udim = bufU[(y / 2) * strideU + (x / 2)] - 128; + int Vdim = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global unsigned char *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = 0xff; /* A */ + destPtr[1] = clamp_uc((y256 + (403 * Vdim)) >> 8, 0, 255); /* R */ + destPtr[2] = clamp_uc((y256 - (48 * Udim) - (120 * Vdim)) >> 8 , 0, 255); /* G */ + destPtr[3] = clamp_uc((y256 + (475 * Udim)) >> 8, 0, 255); /* B */ +} + +__kernel void yuv420_to_bgra_1b( + __global unsigned char *bufY, int strideY, + __global unsigned char *bufU, int strideU, + __global unsigned char *bufV, int strideV, + __global unsigned char *dest, int strideDest) +{ + unsigned int x = get_global_id(0); + unsigned int y = get_global_id(1); + + int Y = bufY[y * strideY + x]; + int U = bufU[(y / 2) * strideU + (x / 2)] - 128; + int V = bufV[(y / 2) * strideV + (x / 2)] - 128; + + __global unsigned char *destPtr = dest + (strideDest * y) + (x * 4); + + /** + * | R | ( | 256 0 403 | | Y | ) + * | G | = ( | 256 -48 -120 | | U - 128 | ) >> 8 + * | B | ( | 256 475 0 | | V - 128 | ) + */ + int y256 = 256 * Y; + destPtr[0] = clamp_uc((y256 + (475 * U)) >> 8, 0, 255); /* B */ + destPtr[1] = clamp_uc((y256 - ( 48 * U) - (120 * V)) >> 8 , 0, 255); /* G */ + destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */ + destPtr[3] = 0xff; /* A */ +} +