diff --git a/libfreerdp/codec/neon/nsc_neon.c b/libfreerdp/codec/neon/nsc_neon.c index 68ca09606..c4504ec63 100644 --- a/libfreerdp/codec/neon/nsc_neon.c +++ b/libfreerdp/codec/neon/nsc_neon.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "../nsc_types.h" @@ -36,5 +37,7 @@ void nsc_init_neon_int(WINPR_ATTR_UNUSED NSC_CONTEXT* WINPR_RESTRICT context) { #if defined(NEON_INTRINSICS_ENABLED) WLog_WARN(TAG, "TODO: Implement neon optimized version of this function"); +#else + WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available"); #endif } diff --git a/libfreerdp/codec/neon/rfx_neon.c b/libfreerdp/codec/neon/rfx_neon.c index 45a8d7a2a..08499541f 100644 --- a/libfreerdp/codec/neon/rfx_neon.c +++ b/libfreerdp/codec/neon/rfx_neon.c @@ -526,7 +526,7 @@ static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp) void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context) { #if defined(NEON_INTRINSICS_ENABLED) - DEBUG_RFX("Using NEON optimizations"); + WLog_VRB(PRIM_TAG, "NEON optimizations"); PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON"); PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_NEON"); PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON"); @@ -534,6 +534,7 @@ void rfx_init_neon_int(RFX_CONTEXT* WINPR_RESTRICT context) context->dwt_2d_decode = rfx_dwt_2d_decode_NEON; context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon; #else + WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or NEON intrinsics not available"); WINPR_UNUSED(context); #endif } diff --git a/libfreerdp/codec/sse/nsc_sse2.c b/libfreerdp/codec/sse/nsc_sse2.c index f792680c3..96847b135 100644 --- a/libfreerdp/codec/sse/nsc_sse2.c +++ b/libfreerdp/codec/sse/nsc_sse2.c @@ -393,9 +393,11 @@ static BOOL nsc_encode_sse2(NSC_CONTEXT* WINPR_RESTRICT context, const BYTE* WIN void nsc_init_sse2_int(NSC_CONTEXT* WINPR_RESTRICT context) { #if defined(SSE_AVX_INTRINSICS_ENABLED) + WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); PROFILER_RENAME(context->priv->prof_nsc_encode, "nsc_encode_sse2") context->encode = nsc_encode_sse2; #else + WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available"); WINPR_UNUSED(context); #endif } diff --git a/libfreerdp/codec/sse/rfx_sse2.c b/libfreerdp/codec/sse/rfx_sse2.c index 9f76d38e8..870bee8e6 100644 --- a/libfreerdp/codec/sse/rfx_sse2.c +++ b/libfreerdp/codec/sse/rfx_sse2.c @@ -454,6 +454,7 @@ static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RE void rfx_init_sse2_int(RFX_CONTEXT* WINPR_RESTRICT context) { #if defined(SSE_AVX_INTRINSICS_ENABLED) + WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2") PROFILER_RENAME(context->priv->prof_rfx_quantization_encode, "rfx_quantization_encode_sse2") PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_sse2") @@ -464,5 +465,6 @@ void rfx_init_sse2_int(RFX_CONTEXT* WINPR_RESTRICT context) context->dwt_2d_encode = rfx_dwt_2d_encode_sse2; #else WINPR_UNUSED(context); + WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE2 intrinsics not available"); #endif } diff --git a/libfreerdp/core/simd.h b/libfreerdp/core/simd.h index d3e9f6ba7..3e4998d39 100644 --- a/libfreerdp/core/simd.h +++ b/libfreerdp/core/simd.h @@ -21,6 +21,9 @@ #pragma once #include +#include + +#define PRIM_TAG FREERDP_TAG("primitives") /* https://sourceforge.net/p/predef/wiki/Architectures/ * diff --git a/libfreerdp/primitives/prim_internal.h b/libfreerdp/primitives/prim_internal.h index 3e5bbf62d..63877920d 100644 --- a/libfreerdp/primitives/prim_internal.h +++ b/libfreerdp/primitives/prim_internal.h @@ -23,12 +23,8 @@ #include #include -#include - #include "../core/simd.h" -#define PRIM_TAG FREERDP_TAG("primitives") - #ifdef __GNUC__ #define PRIM_ALIGN_128 __attribute__((aligned(16))) #else diff --git a/libfreerdp/primitives/sse/prim_colors_sse2.c b/libfreerdp/primitives/sse/prim_colors_sse2.c index cbf85f848..fd4794e3c 100644 --- a/libfreerdp/primitives/sse/prim_colors_sse2.c +++ b/libfreerdp/primitives/sse/prim_colors_sse2.c @@ -208,29 +208,29 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], /* The comments below pretend these are 8-byte registers * rather than 16-byte, for readability. */ - __m128i R0 = b1; /* R0 = 00B300B200B100B0 */ - __m128i R1 = b2; /* R1 = 00B700B600B500B4 */ - R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */ - R1 = g1; /* R1 = 00G300G200G100G0 */ - __m128i R2 = g2; /* R2 = 00G700G600G500G4 */ - R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */ - R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ - R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */ - R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */ - R0 = r1; /* R0 = 00R300R200R100R0 */ - __m128i R3 = r2; /* R3 = 00R700R600R500R4 */ - R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */ - R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */ - __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ - R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */ - R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */ - R0 = R4; /* R0 = R4 */ - R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */ - R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */ - R2 = R3; /* R2 = R3 */ - R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */ - R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */ - STORE_SI128(d_buf, R0); /* B1G1R1FFB0G0R0FF */ + __m128i R0 = b1; /* R0 = 00B300B200B100B0 */ + __m128i R1 = b2; /* R1 = 00B700B600B500B4 */ + R0 = _mm_packus_epi16(R0, R1); /* R0 = B7B6B5B4B3B2B1B0 */ + R1 = g1; /* R1 = 00G300G200G100G0 */ + __m128i R2 = g2; /* R2 = 00G700G600G500G4 */ + R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */ + R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ + R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = B3G3B2G2B1G1B0G0 */ + R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = B7G7B6G6B5G5B4G4 */ + R0 = r1; /* R0 = 00R300R200R100R0 */ + __m128i R3 = r2; /* R3 = 00R700R600R500R4 */ + R0 = _mm_packus_epi16(R0, R3); /* R0 = R7R6R5R4R3R2R1R0 */ + R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */ + __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ + R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = R3FFR2FFR1FFR0FF */ + R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = R7FFR6FFR5FFR4FF */ + R0 = R4; /* R0 = R4 */ + R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = B1G1R1FFB0G0R0FF */ + R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = B3G3R3FFB2G2R2FF */ + R2 = R3; /* R2 = R3 */ + R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */ + R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */ + STORE_SI128(d_buf, R0); /* B1G1R1FFB0G0R0FF */ d_buf += sizeof(__m128i); STORE_SI128(d_buf, R4); /* B3G3R3FFB2G2R2FF */ d_buf += sizeof(__m128i); @@ -377,29 +377,29 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], /* The comments below pretend these are 8-byte registers * rather than 16-byte, for readability. */ - __m128i R0 = r1; /* R0 = 00R300R200R100R0 */ - __m128i R1 = r2; /* R1 = 00R700R600R500R4 */ - R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */ - R1 = g1; /* R1 = 00G300G200G100G0 */ - __m128i R2 = g2; /* R2 = 00G700G600G500G4 */ - R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */ - R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ - R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */ - R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */ - R0 = b1; /* R0 = 00B300B200B100B0 */ - __m128i R3 = b2; /* R3 = 00B700B600B500B4 */ - R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */ - R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */ - __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ - R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */ - R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */ - R0 = R4; /* R0 = R4 */ - R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */ - R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */ - R2 = R3; /* R2 = R3 */ - R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */ - R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */ - STORE_SI128(d_buf, R0); /* R1G1B1FFR0G0B0FF */ + __m128i R0 = r1; /* R0 = 00R300R200R100R0 */ + __m128i R1 = r2; /* R1 = 00R700R600R500R4 */ + R0 = _mm_packus_epi16(R0, R1); /* R0 = R7R6R5R4R3R2R1R0 */ + R1 = g1; /* R1 = 00G300G200G100G0 */ + __m128i R2 = g2; /* R2 = 00G700G600G500G4 */ + R1 = _mm_packus_epi16(R1, R2); /* R1 = G7G6G5G4G3G2G1G0 */ + R2 = R1; /* R2 = G7G6G5G4G3G2G1G0 */ + R2 = _mm_unpacklo_epi8(R0, R2); /* R2 = R3G3R2G2R1G1R0G0 */ + R1 = _mm_unpackhi_epi8(R0, R1); /* R1 = R7G7R6G6R5G5R4G4 */ + R0 = b1; /* R0 = 00B300B200B100B0 */ + __m128i R3 = b2; /* R3 = 00B700B600B500B4 */ + R0 = _mm_packus_epi16(R0, R3); /* R0 = B7B6B5B4B3B2B1B0 */ + R3 = mm_set1_epu32(0xFFFFFFFFU); /* R3 = FFFFFFFFFFFFFFFF */ + __m128i R4 = R3; /* R4 = FFFFFFFFFFFFFFFF */ + R4 = _mm_unpacklo_epi8(R0, R4); /* R4 = B3FFB2FFB1FFB0FF */ + R3 = _mm_unpackhi_epi8(R0, R3); /* R3 = B7FFB6FFB5FFB4FF */ + R0 = R4; /* R0 = R4 */ + R0 = _mm_unpacklo_epi16(R2, R0); /* R0 = R1G1B1FFR0G0B0FF */ + R4 = _mm_unpackhi_epi16(R2, R4); /* R4 = R3G3B3FFR2G2B2FF */ + R2 = R3; /* R2 = R3 */ + R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */ + R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */ + STORE_SI128(d_buf, R0); /* R1G1B1FFR0G0B0FF */ d_buf += sizeof(__m128i); STORE_SI128(d_buf, R4); /* R3G3B3FFR2G2B2FF */ d_buf += sizeof(__m128i); @@ -441,7 +441,7 @@ sse2_yCbCrToRGB_16s8u_P3AC4R(const INT16* WINPR_RESTRICT pSrc[3], UINT32 srcStep const prim_size_t* WINPR_RESTRICT roi) /* region of interest */ { if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) || - ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst)&0x0f) || (srcStep & 0x0f) || + ((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst) & 0x0f) || (srcStep & 0x0f) || (dstStep & 0x0f)) { /* We can't maintain 16-byte alignment. */ @@ -1044,7 +1044,7 @@ void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims) #if defined(SSE_AVX_INTRINSICS_ENABLED) generic = primitives_get_generic(); - WLog_VRB(PRIM_TAG, "SSE2 optimizations"); + WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R; prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R; prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3; diff --git a/libfreerdp/primitives/sse/prim_set_sse2.c b/libfreerdp/primitives/sse/prim_set_sse2.c index 80c0ed323..a1f52a590 100644 --- a/libfreerdp/primitives/sse/prim_set_sse2.c +++ b/libfreerdp/primitives/sse/prim_set_sse2.c @@ -223,7 +223,7 @@ void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims) /* Pick tuned versions if possible. */ - WLog_VRB(PRIM_TAG, "SSE2 optimizations"); + WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); prims->set_8u = sse2_set_8u; prims->set_32s = sse2_set_32s; prims->set_32u = sse2_set_32u;