From 413c9b8a8529cf0d819600cdc564682f987b85d7 Mon Sep 17 00:00:00 2001 From: akallabeth Date: Wed, 26 Feb 2025 23:00:53 +0100 Subject: [PATCH] [primitives,sse] enable and refactor prefetch code --- libfreerdp/primitives/sse/prim_colors_sse2.c | 37 ++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/libfreerdp/primitives/sse/prim_colors_sse2.c b/libfreerdp/primitives/sse/prim_colors_sse2.c index 868b03a50..ca441abe5 100644 --- a/libfreerdp/primitives/sse/prim_colors_sse2.c +++ b/libfreerdp/primitives/sse/prim_colors_sse2.c @@ -33,6 +33,8 @@ static primitives_t* generic = NULL; +#define CACHE_LINE_BYTES 64 + /* 1.403 << 14 */ /* -0.344 << 14 */ /* -0.714 << 14 */ @@ -77,6 +79,24 @@ static inline __m128i mm_between_epi16_int(__m128i val, __m128i min, __m128i max #define mm_between_epi16(_val, _min, _max) (_val) = mm_between_epi16_int((_val), (_min), (_max)) +static inline void mm_prefetch_buffer(const void* WINPR_RESTRICT buffer, size_t width, + size_t stride, size_t height) +{ /* Walks `height` rows of `buffer` and issues one _mm_prefetch(..., _MM_HINT_NTA) per CACHE_LINE_BYTES bytes of each row; only addresses are computed here, nothing is read or written through the pointer. */ + const size_t srcbump = stride / sizeof(__m128i); /* row pitch in __m128i elements: stride is treated as a byte count, any remainder is truncated */ + const __m128i* buf = (const __m128i*)buffer; + + for (size_t y = 0; y < height; y++) + { + const __m128i* line = &buf[y * srcbump]; + for (size_t x = 0; x < width * sizeof(INT16) / sizeof(__m128i); /* bound: whole __m128i vectors per row, with width counted in INT16 samples; rows shorter than one vector get no prefetch */ + x += (CACHE_LINE_BYTES / sizeof(__m128i))) /* advance by one cache line's worth of vectors */ + { + const char* ptr = (const char*)&line[x]; + _mm_prefetch(ptr, _MM_HINT_NTA); /* uses the NTA locality hint */ + } + } +} + /*---------------------------------------------------------------------------*/ static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep, @@ -111,6 +131,11 @@ sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep, __m128i c4096 = _mm_set1_epi16(4096); const size_t srcbump = WINPR_ASSERTING_INT_CAST(size_t, srcStep) / sizeof(__m128i); const size_t dstbump = WINPR_ASSERTING_INT_CAST(size_t, dstStep) / sizeof(__m128i); + + mm_prefetch_buffer(y_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(cb_buf, roi->width, 
(size_t)srcStep, roi->height); + mm_prefetch_buffer(cr_buf, roi->width, (size_t)srcStep, roi->height); + const size_t imax = roi->width * sizeof(INT16) / sizeof(__m128i); for (UINT32 yp = 0; yp < roi->height; ++yp) @@ -203,6 +228,10 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], BYTE* d_buf = pDst; const size_t dstPad = (dstStep - roi->width * 4); + mm_prefetch_buffer(y_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(cr_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(cb_buf, roi->width, (size_t)srcStep, roi->height); + for (UINT32 yp = 0; yp < roi->height; ++yp) { for (size_t i = 0; i < imax; i += 2) @@ -368,6 +397,10 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], BYTE* d_buf = pDst; const size_t dstPad = (dstStep - roi->width * 4); + mm_prefetch_buffer(y_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(cb_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(cr_buf, roi->width, (size_t)srcStep, roi->height); + for (UINT32 yp = 0; yp < roi->height; ++yp) { for (size_t i = 0; i < imax; i += 2) @@ -572,6 +605,10 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep, const size_t srcbump = WINPR_ASSERTING_INT_CAST(size_t, srcStep) / sizeof(__m128i); const size_t dstbump = WINPR_ASSERTING_INT_CAST(size_t, dstStep) / sizeof(__m128i); + mm_prefetch_buffer(r_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(g_buf, roi->width, (size_t)srcStep, roi->height); + mm_prefetch_buffer(b_buf, roi->width, (size_t)srcStep, roi->height); + const size_t imax = roi->width * sizeof(INT16) / sizeof(__m128i); for (UINT32 yp = 0; yp < roi->height; ++yp)