diff --git a/libfreerdp/codec/CMakeLists.txt b/libfreerdp/codec/CMakeLists.txt index a76d6a270..10dac8c5c 100644 --- a/libfreerdp/codec/CMakeLists.txt +++ b/libfreerdp/codec/CMakeLists.txt @@ -41,20 +41,20 @@ set(CODEC_SRCS yuv.c ) -set(CODEC_SSE2_SRCS sse/rfx_sse2.c sse/rfx_sse2.h sse/nsc_sse2.c sse/nsc_sse2.h) +set(CODEC_SSE3_SRCS sse/rfx_sse2.c sse/rfx_sse2.h sse/nsc_sse2.c sse/nsc_sse2.h) set(CODEC_NEON_SRCS neon/rfx_neon.c neon/rfx_neon.h neon/nsc_neon.c neon/nsc_neon.h) # Append initializers set(CODEC_LIBS "") -list(APPEND CODEC_SRCS ${CODEC_SSE2_SRCS}) +list(APPEND CODEC_SRCS ${CODEC_SSE3_SRCS}) list(APPEND CODEC_SRCS ${CODEC_NEON_SRCS}) include(CompilerDetect) include(DetectIntrinsicSupport) if(WITH_SIMD) - set_simd_source_file_properties("sse2" ${CODEC_SSE2_SRCS}) + set_simd_source_file_properties("sse3" ${CODEC_SSE3_SRCS}) set_simd_source_file_properties("neon" ${CODEC_NEON_SRCS}) endif() diff --git a/libfreerdp/codec/sse/nsc_sse2.c b/libfreerdp/codec/sse/nsc_sse2.c index e35998884..f3ef4ce5a 100644 --- a/libfreerdp/codec/sse/nsc_sse2.c +++ b/libfreerdp/codec/sse/nsc_sse2.c @@ -26,6 +26,7 @@ #include "nsc_sse2.h" #include "../../core/simd.h" +#include "../../primitives/sse/prim_avxsse.h" #if defined(SSE_AVX_INTRINSICS_ENABLED) #include @@ -290,13 +291,13 @@ static BOOL nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context, const BYTE* dat cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1)); cg_val = _mm_srai_epi16(cg_val, ccl); y_val = _mm_packus_epi16(y_val, y_val); - _mm_storeu_si128((__m128i*)yplane, y_val); + STORE_SI128(yplane, y_val); co_val = _mm_packs_epi16(co_val, co_val); - _mm_storeu_si128((__m128i*)coplane, co_val); + STORE_SI128(coplane, co_val); cg_val = _mm_packs_epi16(cg_val, cg_val); - _mm_storeu_si128((__m128i*)cgplane, cg_val); + STORE_SI128(cgplane, cg_val); a_val = _mm_packus_epi16(a_val, a_val); - _mm_storeu_si128((__m128i*)aplane, a_val); + STORE_SI128(aplane, a_val); yplane += 8; coplane += 8; cgplane += 8; @@ -354,21 +355,21 @@ 
static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context) for (UINT32 x = 0; x < tempWidth >> 1; x += 8) { - t = _mm_loadu_si128((__m128i*)co_src0); - t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*)co_src1)); + t = LOAD_SI128(co_src0); + t = _mm_avg_epu8(t, LOAD_SI128(co_src1)); val = _mm_and_si128(_mm_srli_si128(t, 1), mask); val = _mm_avg_epu16(val, _mm_and_si128(t, mask)); val = _mm_packus_epi16(val, val); - _mm_storeu_si128((__m128i*)co_dst, val); + STORE_SI128(co_dst, val); co_dst += 8; co_src0 += 16; co_src1 += 16; - t = _mm_loadu_si128((__m128i*)cg_src0); - t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*)cg_src1)); + t = LOAD_SI128(cg_src0); + t = _mm_avg_epu8(t, LOAD_SI128(cg_src1)); val = _mm_and_si128(_mm_srli_si128(t, 1), mask); val = _mm_avg_epu16(val, _mm_and_si128(t, mask)); val = _mm_packus_epi16(val, val); - _mm_storeu_si128((__m128i*)cg_dst, val); + STORE_SI128(cg_dst, val); cg_dst += 8; cg_src0 += 16; cg_src1 += 16; @@ -391,7 +392,8 @@ static BOOL nsc_encode_sse2(NSC_CONTEXT* context, const BYTE* data, UINT32 scanl void nsc_init_sse2(NSC_CONTEXT* context) { #if defined(SSE_AVX_INTRINSICS_ENABLED) - if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE)) + if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) || + !IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) return; PROFILER_RENAME(context->priv->prof_nsc_encode, "nsc_encode_sse2") diff --git a/libfreerdp/codec/sse/rfx_sse2.c b/libfreerdp/codec/sse/rfx_sse2.c index af0a6b50d..6eee9c957 100644 --- a/libfreerdp/codec/sse/rfx_sse2.c +++ b/libfreerdp/codec/sse/rfx_sse2.c @@ -27,6 +27,7 @@ #include "rfx_sse2.h" #include "../../core/simd.h" +#include "../../primitives/sse/prim_avxsse.h" #if defined(SSE_AVX_INTRINSICS_ENABLED) #include @@ -75,10 +76,10 @@ rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer, const size_t bu do { - const __m128i la = _mm_load_si128(ptr); + const __m128i la = LOAD_SI128(ptr); const __m128i a = _mm_slli_epi16(la, 
WINPR_ASSERTING_INT_CAST(int, factor)); - _mm_store_si128(ptr, a); + STORE_SI128(ptr, a); ptr++; } while (ptr < buf_end); } @@ -116,10 +117,10 @@ rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer, const unsigned do { - const __m128i la = _mm_load_si128(ptr); + const __m128i la = LOAD_SI128(ptr); __m128i a = _mm_add_epi16(la, half); a = _mm_srai_epi16(a, factor); - _mm_store_si128(ptr, a); + STORE_SI128(ptr, a); ptr++; } while (ptr < buf_end); } @@ -177,9 +178,9 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC for (size_t n = 0; n < subband_width; n += 8) { /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ - __m128i l_n = _mm_load_si128((__m128i*)l_ptr); - __m128i h_n = _mm_load_si128((__m128i*)h_ptr); - __m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - 1)); + __m128i l_n = LOAD_SI128(l_ptr); + __m128i h_n = LOAD_SI128(h_ptr); + __m128i h_n_m = LOAD_SI128(h_ptr - 1); if (n == 0) { @@ -191,7 +192,7 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1)); tmp_n = _mm_srai_epi16(tmp_n, 1); const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n); - _mm_store_si128((__m128i*)l_ptr, dst_n); + STORE_SI128(l_ptr, dst_n); l_ptr += 8; h_ptr += 8; } @@ -203,10 +204,10 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC for (size_t n = 0; n < subband_width; n += 8) { /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ - __m128i h_n = _mm_load_si128((__m128i*)h_ptr); + __m128i h_n = LOAD_SI128(h_ptr); h_n = _mm_slli_epi16(h_n, 1); - __m128i dst_n = _mm_load_si128((__m128i*)(l_ptr)); - __m128i dst_n_p = _mm_loadu_si128((__m128i*)(l_ptr + 1)); + __m128i dst_n = LOAD_SI128(l_ptr); + __m128i dst_n_p = LOAD_SI128(l_ptr + 1); if (n == subband_width - 8) { @@ -219,8 +220,8 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC tmp_n = _mm_add_epi16(tmp_n, h_n); dst1 = _mm_unpacklo_epi16(dst_n, 
tmp_n); dst2 = _mm_unpackhi_epi16(dst_n, tmp_n); - _mm_store_si128((__m128i*)dst_ptr, dst1); - _mm_store_si128((__m128i*)(dst_ptr + 8), dst2); + STORE_SI128(dst_ptr, dst1); + STORE_SI128(dst_ptr + 8, dst2); l_ptr += 8; h_ptr += 8; dst_ptr += 16; @@ -243,21 +244,21 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT for (size_t x = 0; x < total_width; x += 8) { /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ - const __m128i l_n = _mm_load_si128((__m128i*)l_ptr); - const __m128i h_n = _mm_load_si128((__m128i*)h_ptr); + const __m128i l_n = LOAD_SI128(l_ptr); + const __m128i h_n = LOAD_SI128(h_ptr); __m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1)); if (n == 0) tmp_n = _mm_add_epi16(tmp_n, h_n); else { - const __m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - total_width)); + const __m128i h_n_m = LOAD_SI128(h_ptr - total_width); tmp_n = _mm_add_epi16(tmp_n, h_n_m); } tmp_n = _mm_srai_epi16(tmp_n, 1); const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n); - _mm_store_si128((__m128i*)dst_ptr, dst_n); + STORE_SI128(dst_ptr, dst_n); l_ptr += 8; h_ptr += 8; dst_ptr += 8; @@ -275,8 +276,8 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT for (size_t x = 0; x < total_width; x += 8) { /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ - __m128i h_n = _mm_load_si128((__m128i*)h_ptr); - __m128i dst_n_m = _mm_load_si128((__m128i*)(dst_ptr - total_width)); + __m128i h_n = LOAD_SI128(h_ptr); + __m128i dst_n_m = LOAD_SI128(dst_ptr - total_width); h_n = _mm_slli_epi16(h_n, 1); __m128i tmp_n = dst_n_m; @@ -284,13 +285,13 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT tmp_n = _mm_add_epi16(tmp_n, dst_n_m); else { - const __m128i dst_n_p = _mm_loadu_si128((__m128i*)(dst_ptr + total_width)); + const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width); tmp_n = _mm_add_epi16(tmp_n, dst_n_p); } tmp_n = _mm_srai_epi16(tmp_n, 1); const __m128i dst_n = _mm_add_epi16(tmp_n, 
h_n); - _mm_store_si128((__m128i*)dst_ptr, dst_n); + STORE_SI128(dst_ptr, dst_n); h_ptr += 8; dst_ptr += 8; } @@ -342,29 +343,29 @@ rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRI { for (size_t x = 0; x < total_width; x += 8) { - __m128i src_2n = _mm_load_si128((__m128i*)src); - __m128i src_2n_1 = _mm_load_si128((__m128i*)(src + total_width)); + __m128i src_2n = LOAD_SI128(src); + __m128i src_2n_1 = LOAD_SI128(src + total_width); __m128i src_2n_2 = src_2n; if (n < subband_width - 1) - src_2n_2 = _mm_load_si128((__m128i*)(src + 2ULL * total_width)); + src_2n_2 = LOAD_SI128(src + 2ULL * total_width); /* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */ __m128i h_n = _mm_add_epi16(src_2n, src_2n_2); h_n = _mm_srai_epi16(h_n, 1); h_n = _mm_sub_epi16(src_2n_1, h_n); h_n = _mm_srai_epi16(h_n, 1); - _mm_store_si128((__m128i*)h, h_n); + STORE_SI128(h, h_n); __m128i h_n_m = h_n; if (n != 0) - h_n_m = _mm_load_si128((__m128i*)(h - total_width)); + h_n_m = LOAD_SI128(h - total_width); /* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */ __m128i l_n = _mm_add_epi16(h_n_m, h_n); l_n = _mm_srai_epi16(l_n, 1); l_n = _mm_add_epi16(l_n, src_2n); - _mm_store_si128((__m128i*)l, l_n); + STORE_SI128(l, l_n); src += 8; l += 8; h += 8; @@ -396,8 +397,8 @@ rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTR h_n = _mm_srai_epi16(h_n, 1); h_n = _mm_sub_epi16(src_2n_1, h_n); h_n = _mm_srai_epi16(h_n, 1); - _mm_store_si128((__m128i*)h, h_n); - __m128i h_n_m = _mm_loadu_si128((__m128i*)(h - 1)); + STORE_SI128(h, h_n); + __m128i h_n_m = LOAD_SI128(h - 1); if (n == 0) { @@ -409,7 +410,7 @@ rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTR __m128i l_n = _mm_add_epi16(h_n_m, h_n); l_n = _mm_srai_epi16(l_n, 1); l_n = _mm_add_epi16(l_n, src_2n); - _mm_store_si128((__m128i*)l, l_n); + STORE_SI128(l, l_n); src += 16; l += 8; h += 8; @@ -453,7 +454,8 @@ static void rfx_dwt_2d_encode_sse2(INT16* 
WINPR_RESTRICT buffer, INT16* WINPR_RE void rfx_init_sse2(RFX_CONTEXT* context) { #if defined(SSE_AVX_INTRINSICS_ENABLED) - if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE)) + if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) || + !IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) return; PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2") diff --git a/libfreerdp/primitives/CMakeLists.txt b/libfreerdp/primitives/CMakeLists.txt index 9957b055c..314f9d7c8 100644 --- a/libfreerdp/primitives/CMakeLists.txt +++ b/libfreerdp/primitives/CMakeLists.txt @@ -25,9 +25,16 @@ set(PRIMITIVES_SRCS prim_internal.h ) -set(PRIMITIVES_SSE2_SRCS sse/prim_colors_sse2.c sse/prim_set_sse2.c sse/prim_avxsse.h sse/prim_templates.h) - -set(PRIMITIVES_SSE3_SRCS sse/prim_add_sse3.c sse/prim_alphaComp_sse3.c sse/prim_andor_sse3.c sse/prim_shift_sse3.c) +set(PRIMITIVES_SSE3_SRCS + sse/prim_avxsse.h + sse/prim_templates.h + sse/prim_colors_sse2.c + sse/prim_set_sse2.c + sse/prim_add_sse3.c + sse/prim_alphaComp_sse3.c + sse/prim_andor_sse3.c + sse/prim_shift_sse3.c +) set(PRIMITIVES_SSSE3_SRCS sse/prim_sign_ssse3.c sse/prim_YCoCg_ssse3.c) @@ -58,14 +65,8 @@ if(WITH_OPENCL) freerdp_library_add(OpenCL::OpenCL) endif() -set(PRIMITIVES_OPT_SRCS - ${PRIMITIVES_NEON_SRCS} - ${PRIMITIVES_SSE2_SRCS} - ${PRIMITIVES_SSE3_SRCS} - ${PRIMITIVES_SSSE3_SRCS} - ${PRIMITIVES_SSE4_1_SRCS} - ${PRIMITIVES_SSE4_2_SRCS} - ${PRIMITIVES_OPENCL_SRCS} +set(PRIMITIVES_OPT_SRCS ${PRIMITIVES_NEON_SRCS} ${PRIMITIVES_SSE3_SRCS} ${PRIMITIVES_SSSE3_SRCS} + ${PRIMITIVES_SSE4_1_SRCS} ${PRIMITIVES_SSE4_2_SRCS} ${PRIMITIVES_OPENCL_SRCS} ) if(WITH_AVX2) @@ -80,7 +81,6 @@ add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS}) include(CompilerDetect) include(DetectIntrinsicSupport) if(WITH_SIMD) - set_simd_source_file_properties("sse2" ${PRIMITIVES_SSE2_SRCS}) set_simd_source_file_properties("sse3" ${PRIMITIVES_SSE3_SRCS}) 
set_simd_source_file_properties("ssse3" ${PRIMITIVES_SSSE3_SRCS}) set_simd_source_file_properties("sse4.1" ${PRIMITIVES_SSE4_1_SRCS}) diff --git a/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c b/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c index c2ad2a632..f9bf13cfa 100644 --- a/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c +++ b/libfreerdp/primitives/sse/prim_YCoCg_ssse3.c @@ -73,7 +73,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr for (UINT32 h = 0; h < height; h++) { UINT32 w = width; - BOOL onStride = 0; /* Get to a 16-byte destination boundary. */ if ((ULONG_PTR)dptr & 0x0f) @@ -96,9 +95,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr w -= startup; } - /* Each loop handles eight pixels at a time. */ - onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE; - while (w >= 8) { __m128i R0; @@ -110,22 +106,10 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr __m128i R6; __m128i R7; - if (onStride) - { - /* The faster path, 16-byte aligned load. */ - R0 = _mm_load_si128((const __m128i*)sptr); - sptr += (128 / 8); - R1 = _mm_load_si128((const __m128i*)sptr); - sptr += (128 / 8); - } - else - { - /* Off-stride, slower LDDQU load. 
*/ - R0 = _mm_lddqu_si128((const __m128i*)sptr); - sptr += (128 / 8); - R1 = _mm_lddqu_si128((const __m128i*)sptr); - sptr += (128 / 8); - } + R0 = LOAD_SI128(sptr); + sptr += (128 / 8); + R1 = LOAD_SI128(sptr); + sptr += (128 / 8); /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */ /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */ @@ -197,9 +181,9 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */ R5 = _mm_unpackhi_epi16(R2, R3); /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */ - _mm_store_si128((__m128i*)dptr, R4); + STORE_SI128(dptr, R4); dptr += (128 / 8); - _mm_store_si128((__m128i*)dptr, R5); + STORE_SI128(dptr, R5); dptr += (128 / 8); w -= 8; } @@ -262,7 +246,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT for (UINT32 h = 0; h < height; h++) { UINT32 w = width; - BOOL onStride = 0; /* Get to a 16-byte destination boundary. */ if ((ULONG_PTR)dptr & 0x0f) @@ -285,47 +268,26 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT w -= startup; } - /* Each loop handles eight pixels at a time. */ - onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE; - while (w >= 8) { - __m128i R0; - __m128i R1; - __m128i R2; - __m128i R3; - __m128i R4; - __m128i R5; - __m128i R6; __m128i R7; - if (onStride) - { - /* The faster path, 16-byte aligned load. */ - R0 = _mm_load_si128((const __m128i*)sptr); - sptr += (128 / 8); - R1 = _mm_load_si128((const __m128i*)sptr); - sptr += (128 / 8); - } - else - { - /* Off-stride, slower LDDQU load. */ - R0 = _mm_lddqu_si128((const __m128i*)sptr); - sptr += (128 / 8); - R1 = _mm_lddqu_si128((const __m128i*)sptr); - sptr += (128 / 8); - } + /* Load via LOAD_SI128, which does not require 16-byte source alignment. 
*/ + __m128i R0 = LOAD_SI128(sptr); + sptr += (128 / 8); + __m128i R1 = LOAD_SI128(sptr); + sptr += (128 / 8); /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */ /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */ /* Shuffle to pack all the like types together. */ - R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); - R3 = _mm_shuffle_epi8(R0, R2); - R4 = _mm_shuffle_epi8(R1, R2); + __m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + __m128i R3 = _mm_shuffle_epi8(R0, R2); + __m128i R4 = _mm_shuffle_epi8(R1, R2); /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */ /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */ - R5 = _mm_unpackhi_epi32(R3, R4); - R6 = _mm_unpacklo_epi32(R3, R4); + __m128i R5 = _mm_unpackhi_epi32(R3, R4); + __m128i R6 = _mm_unpacklo_epi32(R3, R4); /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */ /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */ @@ -390,9 +352,9 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */ R5 = _mm_unpackhi_epi16(R2, R3); /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */ - _mm_store_si128((__m128i*)dptr, R4); + STORE_SI128(dptr, R4); dptr += (128 / 8); - _mm_store_si128((__m128i*)dptr, R5); + STORE_SI128(dptr, R5); dptr += (128 / 8); w -= 8; } @@ -456,6 +418,7 @@ void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims) primitives_init_YCoCg(prims); if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) && + IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) { WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations"); diff --git a/libfreerdp/primitives/sse/prim_YUV_sse4.1.c b/libfreerdp/primitives/sse/prim_YUV_sse4.1.c index 620219f14..db5b5251a 100644 --- a/libfreerdp/primitives/sse/prim_YUV_sse4.1.c +++ b/libfreerdp/primitives/sse/prim_YUV_sse4.1.c @@ -28,6 +28,7 @@ #include #include +#include "prim_internal.h" #include "prim_avxsse.h" #include "prim_YUV.h" @@ -56,7 
+57,7 @@ static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yr mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080), mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) }; const __m128i c128 = _mm_set1_epi16(128); - __m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst), + __m128i BGRX = _mm_and_si128(LOAD_SI128(dst), mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)); { __m128i C; @@ -117,7 +118,7 @@ static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yr BGRX = _mm_or_si128(BGRX, packed); } } - _mm_storeu_si128(dst++, BGRX); + STORE_SI128(dst++, BGRX); return dst; } @@ -140,9 +141,9 @@ static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[] for (UINT32 x = 0; x < nWidth - pad; x += 16) { - const __m128i Y = _mm_loadu_si128((const __m128i*)YData); - const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData); - const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData); + const __m128i Y = LOAD_SI128(YData); + const __m128i uRaw = LOAD_SI128(UData); + const __m128i vRaw = LOAD_SI128(VData); const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); YData += 16; @@ -445,12 +446,9 @@ static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW( size_t x = 0; for (; x < nWidth - pad; x += 16) { - const __m128i Y[] = { _mm_loadu_si128((const __m128i*)&YData[0][x]), - _mm_loadu_si128((const __m128i*)&YData[1][x]) }; - __m128i U[] = { _mm_loadu_si128((const __m128i*)&UData[0][x]), - _mm_loadu_si128((const __m128i*)&UData[1][x]) }; - __m128i V[] = { _mm_loadu_si128((const __m128i*)&VData[0][x]), - _mm_loadu_si128((const __m128i*)&VData[1][x]) }; + const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) }; + __m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) }; + __m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) }; BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] }; 
sse41_BGRX_fillRGB(dstp, Y, U, V); @@ -636,21 +634,21 @@ static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE for (; x < width - width % 16; x += 16) { /* store 16 rgba pixels in 4 128 bit registers */ - __m128i x0 = _mm_loadu_si128(argb++); // 1st 4 pixels + __m128i x0 = LOAD_SI128(argb++); // 1st 4 pixels { x0 = _mm_maddubs_epi16(x0, y_factors); - __m128i x1 = _mm_loadu_si128(argb++); // 2nd 4 pixels + __m128i x1 = LOAD_SI128(argb++); // 2nd 4 pixels x1 = _mm_maddubs_epi16(x1, y_factors); x0 = _mm_hadds_epi16(x0, x1); x0 = _mm_srli_epi16(x0, Y_SHIFT); } - __m128i x2 = _mm_loadu_si128(argb++); // 3rd 4 pixels + __m128i x2 = LOAD_SI128(argb++); // 3rd 4 pixels { x2 = _mm_maddubs_epi16(x2, y_factors); - __m128i x3 = _mm_loadu_si128(argb++); // 4th 4 pixels + __m128i x3 = LOAD_SI128(argb++); // 4th 4 pixels x3 = _mm_maddubs_epi16(x3, y_factors); x2 = _mm_hadds_epi16(x2, x3); x2 = _mm_srli_epi16(x2, Y_SHIFT); @@ -658,7 +656,7 @@ static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE x0 = _mm_packus_epi16(x0, x2); /* save to y plane */ - _mm_storeu_si128(ydst++, x0); + STORE_SI128(ydst++, x0); } for (; x < width; x++) @@ -688,20 +686,20 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1, __m64* vdst = (__m64*)&dst2[x / 2]; /* subsample 16x2 pixels into 16x1 pixels */ - __m128i x0 = _mm_loadu_si128(&rgb1[0]); - __m128i x4 = _mm_loadu_si128(&rgb2[0]); + __m128i x0 = LOAD_SI128(&rgb1[0]); + __m128i x4 = LOAD_SI128(&rgb2[0]); x0 = _mm_avg_epu8(x0, x4); - __m128i x1 = _mm_loadu_si128(&rgb1[1]); - x4 = _mm_loadu_si128(&rgb2[1]); + __m128i x1 = LOAD_SI128(&rgb1[1]); + x4 = LOAD_SI128(&rgb2[1]); x1 = _mm_avg_epu8(x1, x4); - __m128i x2 = _mm_loadu_si128(&rgb1[2]); - x4 = _mm_loadu_si128(&rgb2[2]); + __m128i x2 = LOAD_SI128(&rgb1[2]); + x4 = LOAD_SI128(&rgb2[2]); x2 = _mm_avg_epu8(x2, x4); - __m128i x3 = _mm_loadu_si128(&rgb1[3]); - x4 = _mm_loadu_si128(&rgb2[3]); + __m128i x3 = 
LOAD_SI128(&rgb1[3]); + x4 = LOAD_SI128(&rgb2[3]); x3 = _mm_avg_epu8(x3, x4); /* subsample these 16x1 pixels into 8x1 pixels */ @@ -827,14 +825,14 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW( for (; x < width - width % 16; x += 16) { /* store 16 rgba pixels in 4 128 bit registers */ - const __m128i xe1 = _mm_loadu_si128(argbEven++); // 1st 4 pixels - const __m128i xe2 = _mm_loadu_si128(argbEven++); // 2nd 4 pixels - const __m128i xe3 = _mm_loadu_si128(argbEven++); // 3rd 4 pixels - const __m128i xe4 = _mm_loadu_si128(argbEven++); // 4th 4 pixels - const __m128i xo1 = _mm_loadu_si128(argbOdd++); // 1st 4 pixels - const __m128i xo2 = _mm_loadu_si128(argbOdd++); // 2nd 4 pixels - const __m128i xo3 = _mm_loadu_si128(argbOdd++); // 3rd 4 pixels - const __m128i xo4 = _mm_loadu_si128(argbOdd++); // 4th 4 pixels + const __m128i xe1 = LOAD_SI128(argbEven++); // 1st 4 pixels + const __m128i xe2 = LOAD_SI128(argbEven++); // 2nd 4 pixels + const __m128i xe3 = LOAD_SI128(argbEven++); // 3rd 4 pixels + const __m128i xe4 = LOAD_SI128(argbEven++); // 4th 4 pixels + const __m128i xo1 = LOAD_SI128(argbOdd++); // 1st 4 pixels + const __m128i xo2 = LOAD_SI128(argbOdd++); // 2nd 4 pixels + const __m128i xo3 = LOAD_SI128(argbOdd++); // 3rd 4 pixels + const __m128i xo4 = LOAD_SI128(argbOdd++); // 4th 4 pixels { /* Y: multiplications with subtotals and horizontal sums */ const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors), @@ -852,12 +850,12 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW( Y_SHIFT); const __m128i yo = _mm_packus_epi16(yo1, yo2); /* store y [b1] */ - _mm_storeu_si128((__m128i*)b1Even, ye); + STORE_SI128(b1Even, ye); b1Even += 16; if (b1Odd) { - _mm_storeu_si128((__m128i*)b1Odd, yo); + STORE_SI128(b1Odd, yo); b1Odd += 16; } } @@ -925,7 +923,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW( if (b1Odd) /* b4 */ { - _mm_storeu_si128((__m128i*)b4, uo); + STORE_SI128(b4, uo); b4 += 16; } @@ -1003,7 +1001,7 @@ 
static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW( if (b1Odd) /* b5 */ { - _mm_storeu_si128((__m128i*)b5, vo); + STORE_SI128(b5, vo); b5 += 16; } @@ -1117,14 +1115,14 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( /* store 16 rgba pixels in 4 128 bit registers * for even and odd rows. */ - const __m128i xe1 = _mm_loadu_si128(argbEven++); /* 1st 4 pixels */ - const __m128i xe2 = _mm_loadu_si128(argbEven++); /* 2nd 4 pixels */ - const __m128i xe3 = _mm_loadu_si128(argbEven++); /* 3rd 4 pixels */ - const __m128i xe4 = _mm_loadu_si128(argbEven++); /* 4th 4 pixels */ - const __m128i xo1 = _mm_loadu_si128(argbOdd++); /* 1st 4 pixels */ - const __m128i xo2 = _mm_loadu_si128(argbOdd++); /* 2nd 4 pixels */ - const __m128i xo3 = _mm_loadu_si128(argbOdd++); /* 3rd 4 pixels */ - const __m128i xo4 = _mm_loadu_si128(argbOdd++); /* 4th 4 pixels */ + const __m128i xe1 = LOAD_SI128(argbEven++); /* 1st 4 pixels */ + const __m128i xe2 = LOAD_SI128(argbEven++); /* 2nd 4 pixels */ + const __m128i xe3 = LOAD_SI128(argbEven++); /* 3rd 4 pixels */ + const __m128i xe4 = LOAD_SI128(argbEven++); /* 4th 4 pixels */ + const __m128i xo1 = LOAD_SI128(argbOdd++); /* 1st 4 pixels */ + const __m128i xo2 = LOAD_SI128(argbOdd++); /* 2nd 4 pixels */ + const __m128i xo3 = LOAD_SI128(argbOdd++); /* 3rd 4 pixels */ + const __m128i xo4 = LOAD_SI128(argbOdd++); /* 4th 4 pixels */ { /* Y: multiplications with subtotals and horizontal sums */ const __m128i y_factors = BGRX_Y_FACTORS; @@ -1136,7 +1134,7 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( Y_SHIFT); const __m128i ye = _mm_packus_epi16(ye1, ye2); /* store y [b1] */ - _mm_storeu_si128((__m128i*)yLumaDstEven, ye); + STORE_SI128(yLumaDstEven, ye); yLumaDstEven += 16; } @@ -1150,7 +1148,7 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW( _mm_maddubs_epi16(xo4, y_factors)), Y_SHIFT); const __m128i yo = _mm_packus_epi16(yo1, yo2); - _mm_storeu_si128((__m128i*)yLumaDstOdd, yo); + STORE_SI128(yLumaDstOdd, yo); 
yLumaDstOdd += 16; } @@ -1470,22 +1468,22 @@ static pstatus_t sse41_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const const __m128i unpackLow = _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); { - const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]); + const __m128i u = LOAD_SI128(&Um[x]); const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); - _mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh); - _mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow); - _mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh); - _mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow); + STORE_SI128(&pU[2ULL * x], uHigh); + STORE_SI128(&pU[2ULL * x + 16], uLow); + STORE_SI128(&pU1[2ULL * x], uHigh); + STORE_SI128(&pU1[2ULL * x + 16], uLow); } { - const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]); + const __m128i u = LOAD_SI128(&Vm[x]); const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); - _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh); - _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow); - _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh); - _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow); + STORE_SI128(&pV[2 * x], uHigh); + STORE_SI128(&pV[2 * x + 16], uLow); + STORE_SI128(&pV1[2 * x], uHigh); + STORE_SI128(&pV1[2 * x + 16], uLow); } } @@ -1578,14 +1576,14 @@ static pstatus_t sse41_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3], for (; x < halfWidth - halfPad; x += 16) { { - const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]); + const __m128i u = LOAD_SI128(&Ua[x]); const __m128i u2 = _mm_unpackhi_epi8(u, zero); const __m128i u1 = _mm_unpacklo_epi8(u, zero); _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); } { - const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]); + const __m128i u = LOAD_SI128(&Va[x]); const __m128i u2 = _mm_unpackhi_epi8(u, zero); const __m128i u1 = 
_mm_unpacklo_epi8(u, zero); _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]); @@ -1641,14 +1639,14 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons for (; x < halfWidth - halfPad; x += 16) { { - const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]); + const __m128i u = LOAD_SI128(&pYaU[x]); const __m128i u2 = _mm_unpackhi_epi8(zero, u); const __m128i u1 = _mm_unpacklo_epi8(zero, u); _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); } { - const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]); + const __m128i v = LOAD_SI128(&pYaV[x]); const __m128i v2 = _mm_unpackhi_epi8(zero, v); const __m128i v1 = _mm_unpacklo_epi8(zero, v); _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); @@ -1678,8 +1676,8 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons for (; x < quaterWidth - quaterPad; x += 16) { { - const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]); - const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]); + const __m128i uU = LOAD_SI128(&pUaU[x]); + const __m128i uV = LOAD_SI128(&pVaU[x]); const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); const __m128i uLow = _mm_unpacklo_epi8(uU, uV); const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); @@ -1692,8 +1690,8 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); } { - const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]); - const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]); + const __m128i vU = LOAD_SI128(&pUaV[x]); + const __m128i vV = LOAD_SI128(&pVaV[x]); const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); const __m128i vLow = _mm_unpacklo_epi8(vU, vV); const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); diff --git a/libfreerdp/primitives/sse/prim_add_sse3.c b/libfreerdp/primitives/sse/prim_add_sse3.c index 24e17b302..67908bc2b 100644 --- 
a/libfreerdp/primitives/sse/prim_add_sse3.c +++ b/libfreerdp/primitives/sse/prim_add_sse3.c @@ -75,29 +75,29 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1, __m128i* vdptr1 = (__m128i*)dptr1; __m128i* vdptr2 = (__m128i*)dptr2; - __m128i xmm0 = _mm_lddqu_si128(vsptr1++); - __m128i xmm1 = _mm_lddqu_si128(vsptr1++); - __m128i xmm2 = _mm_lddqu_si128(vsptr1++); - __m128i xmm3 = _mm_lddqu_si128(vsptr1++); - __m128i xmm4 = _mm_lddqu_si128(vsptr2++); - __m128i xmm5 = _mm_lddqu_si128(vsptr2++); - __m128i xmm6 = _mm_lddqu_si128(vsptr2++); - __m128i xmm7 = _mm_lddqu_si128(vsptr2++); + __m128i xmm0 = LOAD_SI128(vsptr1++); + __m128i xmm1 = LOAD_SI128(vsptr1++); + __m128i xmm2 = LOAD_SI128(vsptr1++); + __m128i xmm3 = LOAD_SI128(vsptr1++); + __m128i xmm4 = LOAD_SI128(vsptr2++); + __m128i xmm5 = LOAD_SI128(vsptr2++); + __m128i xmm6 = LOAD_SI128(vsptr2++); + __m128i xmm7 = LOAD_SI128(vsptr2++); xmm0 = _mm_adds_epi16(xmm0, xmm4); xmm1 = _mm_adds_epi16(xmm1, xmm5); xmm2 = _mm_adds_epi16(xmm2, xmm6); xmm3 = _mm_adds_epi16(xmm3, xmm7); - _mm_store_si128(vdptr1++, xmm0); - _mm_store_si128(vdptr1++, xmm1); - _mm_store_si128(vdptr1++, xmm2); - _mm_store_si128(vdptr1++, xmm3); + STORE_SI128(vdptr1++, xmm0); + STORE_SI128(vdptr1++, xmm1); + STORE_SI128(vdptr1++, xmm2); + STORE_SI128(vdptr1++, xmm3); - _mm_store_si128(vdptr2++, xmm0); - _mm_store_si128(vdptr2++, xmm1); - _mm_store_si128(vdptr2++, xmm2); - _mm_store_si128(vdptr2++, xmm3); + STORE_SI128(vdptr2++, xmm0); + STORE_SI128(vdptr2++, xmm1); + STORE_SI128(vdptr2++, xmm2); + STORE_SI128(vdptr2++, xmm3); dptr1 = (INT16*)vdptr1; dptr2 = (INT16*)vdptr2; @@ -113,29 +113,29 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1, __m128i* vdptr1 = (__m128i*)dptr1; __m128i* vdptr2 = (__m128i*)dptr2; - __m128i xmm0 = _mm_load_si128(vsptr1++); - __m128i xmm1 = _mm_load_si128(vsptr1++); - __m128i xmm2 = _mm_load_si128(vsptr1++); - __m128i xmm3 = _mm_load_si128(vsptr1++); - __m128i xmm4 = 
_mm_load_si128(vsptr2++); - __m128i xmm5 = _mm_load_si128(vsptr2++); - __m128i xmm6 = _mm_load_si128(vsptr2++); - __m128i xmm7 = _mm_load_si128(vsptr2++); + __m128i xmm0 = LOAD_SI128(vsptr1++); + __m128i xmm1 = LOAD_SI128(vsptr1++); + __m128i xmm2 = LOAD_SI128(vsptr1++); + __m128i xmm3 = LOAD_SI128(vsptr1++); + __m128i xmm4 = LOAD_SI128(vsptr2++); + __m128i xmm5 = LOAD_SI128(vsptr2++); + __m128i xmm6 = LOAD_SI128(vsptr2++); + __m128i xmm7 = LOAD_SI128(vsptr2++); xmm0 = _mm_adds_epi16(xmm0, xmm4); xmm1 = _mm_adds_epi16(xmm1, xmm5); xmm2 = _mm_adds_epi16(xmm2, xmm6); xmm3 = _mm_adds_epi16(xmm3, xmm7); - _mm_store_si128(vdptr1++, xmm0); - _mm_store_si128(vdptr1++, xmm1); - _mm_store_si128(vdptr1++, xmm2); - _mm_store_si128(vdptr1++, xmm3); + STORE_SI128(vdptr1++, xmm0); + STORE_SI128(vdptr1++, xmm1); + STORE_SI128(vdptr1++, xmm2); + STORE_SI128(vdptr1++, xmm3); - _mm_store_si128(vdptr2++, xmm0); - _mm_store_si128(vdptr2++, xmm1); - _mm_store_si128(vdptr2++, xmm2); - _mm_store_si128(vdptr2++, xmm3); + STORE_SI128(vdptr2++, xmm0); + STORE_SI128(vdptr2++, xmm1); + STORE_SI128(vdptr2++, xmm2); + STORE_SI128(vdptr2++, xmm3); dptr1 = (INT16*)vdptr1; dptr2 = (INT16*)vdptr2; @@ -156,8 +156,8 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1, xmm0 = _mm_adds_epi16(xmm0, xmm1); - _mm_store_si128(vdptr1++, xmm0); - _mm_store_si128(vdptr2++, xmm0); + STORE_SI128(vdptr1++, xmm0); + STORE_SI128(vdptr2++, xmm0); dptr1 = (INT16*)vdptr1; dptr2 = (INT16*)vdptr2; diff --git a/libfreerdp/primitives/sse/prim_alphaComp_sse3.c b/libfreerdp/primitives/sse/prim_alphaComp_sse3.c index e2b327608..f48e66ec3 100644 --- a/libfreerdp/primitives/sse/prim_alphaComp_sse3.c +++ b/libfreerdp/primitives/sse/prim_alphaComp_sse3.c @@ -28,6 +28,7 @@ #include "prim_alphaComp.h" +#include "prim_internal.h" #include "prim_avxsse.h" /* ------------------------------------------------------------------------- */ @@ -171,7 +172,7 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* 
WINPR_RESTRICT pSrc1, UINT32 sr xmm5 = _mm_and_si128(xmm5, xmm3); /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ xmm5 = _mm_packus_epi16(xmm5, xmm4); - _mm_store_si128((__m128i*)dptr, xmm5); + STORE_SI128(dptr, xmm5); dptr += 4; } diff --git a/libfreerdp/primitives/sse/prim_avxsse.h b/libfreerdp/primitives/sse/prim_avxsse.h index 4dbc49562..11fdcdfe5 100644 --- a/libfreerdp/primitives/sse/prim_avxsse.h +++ b/libfreerdp/primitives/sse/prim_avxsse.h @@ -19,13 +19,19 @@ */ #pragma once -#include "prim_internal.h" +#include + +#include "../../core/simd.h" #if defined(SSE_AVX_INTRINSICS_ENABLED) #include +#include + static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4) { - return _mm_set_epi32((int32_t)val1, (int32_t)val2, (int32_t)val3, (int32_t)val4); + return _mm_set_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val1), WINPR_CXX_COMPAT_CAST(int32_t, val2), + WINPR_CXX_COMPAT_CAST(int32_t, val3), + WINPR_CXX_COMPAT_CAST(int32_t, val4)); } static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4, @@ -33,31 +39,36 @@ static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12, uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16) { - return _mm_set_epi8((int8_t)val1, (int8_t)val2, (int8_t)val3, (int8_t)val4, (int8_t)val5, - (int8_t)val6, (int8_t)val7, (int8_t)val8, (int8_t)val9, (int8_t)val10, - (int8_t)val11, (int8_t)val12, (int8_t)val13, (int8_t)val14, (int8_t)val15, - (int8_t)val16); + return _mm_set_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val1), WINPR_CXX_COMPAT_CAST(int8_t, val2), + WINPR_CXX_COMPAT_CAST(int8_t, val3), WINPR_CXX_COMPAT_CAST(int8_t, val4), + WINPR_CXX_COMPAT_CAST(int8_t, val5), WINPR_CXX_COMPAT_CAST(int8_t, val6), + WINPR_CXX_COMPAT_CAST(int8_t, val7), WINPR_CXX_COMPAT_CAST(int8_t, val8), + WINPR_CXX_COMPAT_CAST(int8_t, val9), WINPR_CXX_COMPAT_CAST(int8_t, val10), + WINPR_CXX_COMPAT_CAST(int8_t, val11), 
WINPR_CXX_COMPAT_CAST(int8_t, val12), + WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14), + WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16)); } static inline __m128i mm_set1_epu32(uint32_t val) { - return _mm_set1_epi32((int32_t)val); + return _mm_set1_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val)); } static inline __m128i mm_set1_epu8(uint8_t val) { - return _mm_set1_epi8((int8_t)val); + return _mm_set1_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val)); } -/* Use lddqu for unaligned; load for 16-byte aligned. */ static inline __m128i LOAD_SI128(const void* ptr) { - const ULONG_PTR uptr = (const ULONG_PTR)ptr; - const __m128i* mptr = (const __m128i*)ptr; - if ((uptr & 0x0f) != 0) - return _mm_loadu_si128(mptr); + const __m128i* mptr = WINPR_CXX_COMPAT_CAST(const __m128i*, ptr); + return _mm_lddqu_si128(mptr); +} - return _mm_load_si128(mptr); +static inline void STORE_SI128(void* ptr, __m128i val) +{ + __m128i* mptr = WINPR_CXX_COMPAT_CAST(__m128i*, ptr); + _mm_storeu_si128(mptr, val); } #endif diff --git a/libfreerdp/primitives/sse/prim_colors_sse2.c b/libfreerdp/primitives/sse/prim_colors_sse2.c index 171f2904d..38520e020 100644 --- a/libfreerdp/primitives/sse/prim_colors_sse2.c +++ b/libfreerdp/primitives/sse/prim_colors_sse2.c @@ -181,32 +181,32 @@ sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep, * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */ /* y = (y_r_buf[i] + 4096) >> 2 */ - __m128i y = _mm_load_si128(y_buf + i); + __m128i y = LOAD_SI128(y_buf + i); y = _mm_add_epi16(y, c4096); y = _mm_srai_epi16(y, 2); /* cb = cb_g_buf[i]; */ - __m128i cb = _mm_load_si128(cb_buf + i); + __m128i cb = LOAD_SI128(cb_buf + i); /* cr = cr_b_buf[i]; */ - __m128i cr = _mm_load_si128(cr_buf + i); + __m128i cr = LOAD_SI128(cr_buf + i); /* (y + HIWORD(cr*22986)) >> 3 */ __m128i r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr)); r = _mm_srai_epi16(r, 3); /* r_buf[i] = CLIP(r); */ mm_between_epi16(r, zero, max); - 
_mm_store_si128(r_buf + i, r); + STORE_SI128(r_buf + i, r); /* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */ __m128i g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb)); g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr)); g = _mm_srai_epi16(g, 3); /* g_buf[i] = CLIP(g); */ mm_between_epi16(g, zero, max); - _mm_store_si128(g_buf + i, g); + STORE_SI128(g_buf + i, g); /* (y + HIWORD(cb*28999)) >> 3 */ __m128i b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb)); b = _mm_srai_epi16(b, 3); /* b_buf[i] = CLIP(b); */ mm_between_epi16(b, zero, max); - _mm_store_si128(b_buf + i, b); + STORE_SI128(b_buf + i, b); } y_buf += srcbump; @@ -291,15 +291,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */ /* y = (y_r_buf[i] + 4096) >> 2 */ - __m128i y1 = _mm_load_si128((const __m128i*)y_buf); + __m128i y1 = LOAD_SI128(y_buf); y_buf += step; y1 = _mm_add_epi16(y1, c4096); y1 = _mm_srai_epi16(y1, 2); /* cb = cb_g_buf[i]; */ - __m128i cb1 = _mm_load_si128((const __m128i*)cb_buf); + __m128i cb1 = LOAD_SI128(cb_buf); cb_buf += step; /* cr = cr_b_buf[i]; */ - __m128i cr1 = _mm_load_si128((const __m128i*)cr_buf); + __m128i cr1 = LOAD_SI128(cr_buf); cr_buf += step; /* (y + HIWORD(cr*22986)) >> 3 */ __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr)); @@ -317,15 +317,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr b1 = _mm_srai_epi16(b1, 3); /* b_buf[i] = CLIP(b); */ mm_between_epi16(b1, zero, max); - __m128i y2 = _mm_load_si128((const __m128i*)y_buf); + __m128i y2 = LOAD_SI128(y_buf); y_buf += step; y2 = _mm_add_epi16(y2, c4096); y2 = _mm_srai_epi16(y2, 2); /* cb = cb_g_buf[i]; */ - __m128i cb2 = _mm_load_si128((const __m128i*)cb_buf); + __m128i cb2 = LOAD_SI128(cb_buf); cb_buf += step; /* cr = cr_b_buf[i]; */ - __m128i cr2 = _mm_load_si128((const __m128i*)cr_buf); + __m128i cr2 = LOAD_SI128(cr_buf); cr_buf += step; /* (y + HIWORD(cr*22986)) >> 3 */ __m128i r2 = 
_mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr)); @@ -369,13 +369,13 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr R2 = R3; /* R2 = R3 */ R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */ R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */ - _mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */ + STORE_SI128(d_buf, R0); /* B1G1R1FFB0G0R0FF */ d_buf += sizeof(__m128i); - _mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */ + STORE_SI128(d_buf, R4); /* B3G3R3FFB2G2R2FF */ d_buf += sizeof(__m128i); - _mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */ + STORE_SI128(d_buf, R2); /* B5G5R5FFB4G4R4FF */ d_buf += sizeof(__m128i); - _mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */ + STORE_SI128(d_buf, R3); /* B7G7R7FFB6G6R6FF */ d_buf += sizeof(__m128i); } } @@ -476,15 +476,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr * r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3 */ /* y = (y_r_buf[i] + 4096) >> 2 */ - __m128i y1 = _mm_load_si128((const __m128i*)y_buf); + __m128i y1 = LOAD_SI128(y_buf); y_buf += step; y1 = _mm_add_epi16(y1, c4096); y1 = _mm_srai_epi16(y1, 2); /* cb = cb_g_buf[i]; */ - __m128i cb1 = _mm_load_si128((const __m128i*)cb_buf); + __m128i cb1 = LOAD_SI128(cb_buf); cb_buf += step; /* cr = cr_b_buf[i]; */ - __m128i cr1 = _mm_load_si128((const __m128i*)cr_buf); + __m128i cr1 = LOAD_SI128(cr_buf); cr_buf += step; /* (y + HIWORD(cr*22986)) >> 3 */ __m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr)); @@ -502,15 +502,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr b1 = _mm_srai_epi16(b1, 3); /* b_buf[i] = CLIP(b); */ mm_between_epi16(b1, zero, max); - __m128i y2 = _mm_load_si128((const __m128i*)y_buf); + __m128i y2 = LOAD_SI128(y_buf); y_buf += step; y2 = _mm_add_epi16(y2, c4096); y2 = _mm_srai_epi16(y2, 2); /* cb = cb_g_buf[i]; */ - __m128i cb2 = _mm_load_si128((const __m128i*)cb_buf); + __m128i 
cb2 = LOAD_SI128(cb_buf); cb_buf += step; /* cr = cr_b_buf[i]; */ - __m128i cr2 = _mm_load_si128((const __m128i*)cr_buf); + __m128i cr2 = LOAD_SI128(cr_buf); cr_buf += step; /* (y + HIWORD(cr*22986)) >> 3 */ __m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr)); @@ -554,13 +554,13 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr R2 = R3; /* R2 = R3 */ R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */ R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */ - _mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */ + STORE_SI128(d_buf, R0); /* R1G1B1FFR0G0B0FF */ d_buf += sizeof(__m128i); - _mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */ + STORE_SI128(d_buf, R4); /* R3G3B3FFR2G2B2FF */ d_buf += sizeof(__m128i); - _mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */ + STORE_SI128(d_buf, R2); /* R5G5B5FFR4G4B4FF */ d_buf += sizeof(__m128i); - _mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */ + STORE_SI128(d_buf, R3); /* R7G7B7FFR6G6B6FF */ d_buf += sizeof(__m128i); } } @@ -694,9 +694,9 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep, * within the upper 16 bits we will also have to scale the RGB * values used in the multiplication by << 5+(16-n). 
*/ - __m128i r = _mm_load_si128(r_buf + i); - __m128i g = _mm_load_si128(g_buf + i); - __m128i b = _mm_load_si128(b_buf + i); + __m128i r = LOAD_SI128(r_buf + i); + __m128i g = LOAD_SI128(g_buf + i); + __m128i b = LOAD_SI128(b_buf + i); /* r<<6; g<<6; b<<6 */ r = _mm_slli_epi16(r, 6); g = _mm_slli_epi16(g, 6); @@ -708,21 +708,21 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep, y = _mm_add_epi16(y, min); /* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */ mm_between_epi16(y, min, max); - _mm_store_si128(y_buf + i, y); + STORE_SI128(y_buf + i, y); /* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */ __m128i cb = _mm_mulhi_epi16(r, cb_r); cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g)); cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b)); /* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */ mm_between_epi16(cb, min, max); - _mm_store_si128(cb_buf + i, cb); + STORE_SI128(cb_buf + i, cb); /* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */ __m128i cr = _mm_mulhi_epi16(r, cr_r); cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g)); cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b)); /* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */ mm_between_epi16(cr, min, max); - _mm_store_si128(cr_buf + i, cr); + STORE_SI128(cr_buf + i, cr); } y_buf += srcbump; @@ -769,27 +769,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX( { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pb); + R0 = LOAD_SI128(pb); pb += 8; /* R0 = 00B300B200B100B0 */ - R1 = _mm_load_si128((const __m128i*)pb); + R1 = LOAD_SI128(pb); pb += 8; /* R1 = 00B700B600B500B4 */ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pg); + R0 = LOAD_SI128(pg); pg += 8; /* R1 = 00G300G200G100G0 */ - R1 = _mm_load_si128((const __m128i*)pg); + R1 = LOAD_SI128(pg); pg += 8; /* R2 = 00G700G600G500G4 */ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ } { __m128i R0; __m128i R1; - 
R0 = _mm_load_si128((const __m128i*)pr); + R0 = LOAD_SI128(pr); pr += 8; /* R0 = 00R300R200R100R0 */ - R1 = _mm_load_si128((const __m128i*)pr); + R1 = LOAD_SI128(pr); pr += 8; /* R3 = 00R700R600R500R4 */ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ } @@ -801,22 +801,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX( { const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR1G1B1FFR0G0B0 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR3G3B3FFR2G2B2 */ } { const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR5G5B5FFR4G4B4 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR7G7B7FFR6G6B6 */ } } @@ -875,27 +875,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX( { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pb); + R0 = LOAD_SI128(pb); pb += 8; /* R0 = 00B300B200B100B0 */ - R1 = _mm_load_si128((const __m128i*)pb); + R1 = LOAD_SI128(pb); pb += 8; /* R1 = 00B700B600B500B4 */ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pg); + R0 = LOAD_SI128(pg); pg += 8; /* R1 = 00G300G200G100G0 */ - R1 = _mm_load_si128((const __m128i*)pg); + R1 = LOAD_SI128(pg); pg += 8; /* R2 = 00G700G600G500G4 */ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pr); + R0 = LOAD_SI128(pr); pr += 8; /* R0 = 00R300R200R100R0 */ - R1 = _mm_load_si128((const __m128i*)pr); + R1 = LOAD_SI128(pr); pr += 8; /* R3 = 00R700R600R500R4 */ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ } @@ -912,22 +912,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX( } { const __m128i 
bgrx = _mm_unpacklo_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR1G1B1FFR0G0B0 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR3G3B3FFR2G2B2 */ } { const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR5G5B5FFR4G4B4 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR7G7B7FFR6G6B6 */ } } @@ -986,27 +986,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR( { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pb); + R0 = LOAD_SI128(pb); pb += 8; /* R0 = 00B300B200B100B0 */ - R1 = _mm_load_si128((const __m128i*)pb); + R1 = LOAD_SI128(pb); pb += 8; /* R1 = 00B700B600B500B4 */ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pg); + R0 = LOAD_SI128(pg); pg += 8; /* R1 = 00G300G200G100G0 */ - R1 = _mm_load_si128((const __m128i*)pg); + R1 = LOAD_SI128(pg); pg += 8; /* R2 = 00G700G600G500G4 */ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pr); + R0 = LOAD_SI128(pr); pr += 8; /* R0 = 00R300R200R100R0 */ - R1 = _mm_load_si128((const __m128i*)pr); + R1 = LOAD_SI128(pr); pr += 8; /* R3 = 00R700R600R500R4 */ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ } @@ -1023,22 +1023,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR( } { const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR1G1B1FFR0G0B0 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR3G3B3FFR2G2B2 */ } { const __m128i bgrx = _mm_unpacklo_epi16(gbHi, 
arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR5G5B5FFR4G4B4 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR7G7B7FFR6G6B6 */ } } @@ -1097,27 +1097,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB( { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pb); + R0 = LOAD_SI128(pb); pb += 8; /* R0 = 00B300B200B100B0 */ - R1 = _mm_load_si128((const __m128i*)pb); + R1 = LOAD_SI128(pb); pb += 8; /* R1 = 00B700B600B500B4 */ b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pg); + R0 = LOAD_SI128(pg); pg += 8; /* R1 = 00G300G200G100G0 */ - R1 = _mm_load_si128((const __m128i*)pg); + R1 = LOAD_SI128(pg); pg += 8; /* R2 = 00G700G600G500G4 */ g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */ } { __m128i R0; __m128i R1; - R0 = _mm_load_si128((const __m128i*)pr); + R0 = LOAD_SI128(pr); pr += 8; /* R0 = 00R300R200R100R0 */ - R1 = _mm_load_si128((const __m128i*)pr); + R1 = LOAD_SI128(pr); pr += 8; /* R3 = 00R700R600R500R4 */ r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */ } @@ -1134,22 +1134,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB( } { const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR1G1B1FFR0G0B0 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR3G3B3FFR2G2B2 */ } { const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR5G5B5FFR4G4B4 */ } { const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi); - _mm_store_si128((__m128i*)out, bgrx); + STORE_SI128(out, bgrx); out += 16; /* FFR7G7B7FFR6G6B6 */ } } @@ -1217,7 +1217,8 @@ void primitives_init_colors_sse2(primitives_t* prims) 
generic = primitives_get_generic(); primitives_init_colors(prims); - if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) { WLog_VRB(PRIM_TAG, "SSE2 optimizations"); prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R; diff --git a/libfreerdp/primitives/sse/prim_copy_sse4_1.c b/libfreerdp/primitives/sse/prim_copy_sse4_1.c index c484408e0..d7dad682a 100644 --- a/libfreerdp/primitives/sse/prim_copy_sse4_1.c +++ b/libfreerdp/primitives/sse/prim_copy_sse4_1.c @@ -22,6 +22,7 @@ #include #include +#include "prim_internal.h" #include "prim_avxsse.h" #include "prim_copy.h" #include "../codec/color.h" @@ -68,12 +69,12 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat { const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte]; __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte]; - const __m128i s0 = _mm_loadu_si128(src); + const __m128i s0 = LOAD_SI128(src); const __m128i s1 = _mm_shuffle_epi8(s0, smask); - const __m128i s2 = _mm_loadu_si128(dst); + const __m128i s2 = LOAD_SI128(dst); __m128i d0 = _mm_blendv_epi8(s1, s2, mask); - _mm_storeu_si128(dst, d0); + STORE_SI128(dst, d0); } } for (; x < nWidth; x++) @@ -118,10 +119,10 @@ static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstDa { const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte]; __m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte]; - const __m128i s0 = _mm_loadu_si128(src); - const __m128i s1 = _mm_loadu_si128(dst); + const __m128i s0 = LOAD_SI128(src); + const __m128i s1 = LOAD_SI128(dst); __m128i d0 = _mm_blendv_epi8(s1, s0, mask); - _mm_storeu_si128(dst, d0); + STORE_SI128(dst, d0); } for (; x < nWidth; x++) diff --git a/libfreerdp/primitives/sse/prim_set_sse2.c b/libfreerdp/primitives/sse/prim_set_sse2.c index e43255177..45dc0f9ff 100644 --- 
a/libfreerdp/primitives/sse/prim_set_sse2.c +++ b/libfreerdp/primitives/sse/prim_set_sse2.c @@ -21,7 +21,8 @@ #include #include -#include "prim_avxsse.h".h " +#include "prim_internal.h" +#include "prim_avxsse.h" #include "prim_set.h" /* ========================================================================= */ @@ -60,37 +61,37 @@ static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len) /* Do 256-byte chunks using one XMM register. */ while (count--) { - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; } @@ -101,7 +102,7 @@ static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len) /* Do 16-byte chunks using one XMM register. 
*/ while (count--) { - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 16; } @@ -152,37 +153,37 @@ static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 le /* Do 256-byte chunks using one XMM register. */ while (count--) { - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; } @@ -193,7 +194,7 @@ static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 le /* Do 16-byte chunks using one XMM register. */ while (count--) { - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 4; } @@ -220,7 +221,8 @@ void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims) primitives_init_set(prims); /* Pick tuned versions if possible. 
*/ - if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) + if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && + IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) { WLog_VRB(PRIM_TAG, "SSE2 optimizations"); prims->set_8u = sse2_set_8u; diff --git a/libfreerdp/primitives/sse/prim_shift_sse3.c b/libfreerdp/primitives/sse/prim_shift_sse3.c index b6e708b48..7709dba0e 100644 --- a/libfreerdp/primitives/sse/prim_shift_sse3.c +++ b/libfreerdp/primitives/sse/prim_shift_sse3.c @@ -79,14 +79,14 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 { const __m128i* src = (const __m128i*)pSrcDst; - __m128i xmm0 = _mm_load_si128(src++); - __m128i xmm1 = _mm_load_si128(src++); - __m128i xmm2 = _mm_load_si128(src++); - __m128i xmm3 = _mm_load_si128(src++); - __m128i xmm4 = _mm_load_si128(src++); - __m128i xmm5 = _mm_load_si128(src++); - __m128i xmm6 = _mm_load_si128(src++); - __m128i xmm7 = _mm_load_si128(src); + __m128i xmm0 = LOAD_SI128(src++); + __m128i xmm1 = LOAD_SI128(src++); + __m128i xmm2 = LOAD_SI128(src++); + __m128i xmm3 = LOAD_SI128(src++); + __m128i xmm4 = LOAD_SI128(src++); + __m128i xmm5 = LOAD_SI128(src++); + __m128i xmm6 = LOAD_SI128(src++); + __m128i xmm7 = LOAD_SI128(src); xmm0 = _mm_slli_epi16(xmm0, (int16_t)val); xmm1 = _mm_slli_epi16(xmm1, (int16_t)val); @@ -99,14 +99,14 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 __m128i* dst = (__m128i*)pSrcDst; - _mm_store_si128(dst++, xmm0); - _mm_store_si128(dst++, xmm1); - _mm_store_si128(dst++, xmm2); - _mm_store_si128(dst++, xmm3); - _mm_store_si128(dst++, xmm4); - _mm_store_si128(dst++, xmm5); - _mm_store_si128(dst++, xmm6); - _mm_store_si128(dst++, xmm7); + STORE_SI128(dst++, xmm0); + STORE_SI128(dst++, xmm1); + STORE_SI128(dst++, xmm2); + STORE_SI128(dst++, xmm3); + STORE_SI128(dst++, xmm4); + STORE_SI128(dst++, xmm5); + STORE_SI128(dst++, xmm6); + STORE_SI128(dst++, xmm7); pSrcDst = (INT16*)dst; } @@ -122,7 +122,7 
@@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 xmm0 = _mm_slli_epi16(xmm0, (int16_t)val); __m128i* dst = (__m128i*)pSrcDst; - _mm_store_si128(dst++, xmm0); + STORE_SI128(dst++, xmm0); pSrcDst = (INT16*)dst; } diff --git a/libfreerdp/primitives/sse/prim_sign_ssse3.c b/libfreerdp/primitives/sse/prim_sign_ssse3.c index 8f7bb8674..8bd41d045 100644 --- a/libfreerdp/primitives/sse/prim_sign_ssse3.c +++ b/libfreerdp/primitives/sse/prim_sign_ssse3.c @@ -21,6 +21,7 @@ #include "prim_sign.h" +#include "prim_internal.h" #include "prim_avxsse.h" #if defined(SSE_AVX_INTRINSICS_ENABLED) @@ -79,25 +80,25 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R xmm1 = _mm_set1_epi16(0x0001U); xmm2 = _mm_set1_epi16(0x0001U); xmm3 = _mm_set1_epi16(0x0001U); - xmm4 = _mm_lddqu_si128((const __m128i*)sptr); + xmm4 = LOAD_SI128(sptr); sptr += 8; - xmm5 = _mm_lddqu_si128((const __m128i*)sptr); + xmm5 = LOAD_SI128(sptr); sptr += 8; - xmm6 = _mm_lddqu_si128((const __m128i*)sptr); + xmm6 = LOAD_SI128(sptr); sptr += 8; - xmm7 = _mm_lddqu_si128((const __m128i*)sptr); + xmm7 = LOAD_SI128(sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm4); xmm1 = _mm_sign_epi16(xmm1, xmm5); xmm2 = _mm_sign_epi16(xmm2, xmm6); xmm3 = _mm_sign_epi16(xmm3, xmm7); - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 8; - _mm_store_si128((__m128i*)dptr, xmm1); + STORE_SI128(dptr, xmm1); dptr += 8; - _mm_store_si128((__m128i*)dptr, xmm2); + STORE_SI128(dptr, xmm2); dptr += 8; - _mm_store_si128((__m128i*)dptr, xmm3); + STORE_SI128(dptr, xmm3); dptr += 8; } } @@ -118,25 +119,25 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R xmm1 = _mm_set1_epi16(0x0001U); xmm2 = _mm_set1_epi16(0x0001U); xmm3 = _mm_set1_epi16(0x0001U); - xmm4 = _mm_load_si128((const __m128i*)sptr); + xmm4 = LOAD_SI128(sptr); sptr += 8; - xmm5 = _mm_load_si128((const __m128i*)sptr); + xmm5 = LOAD_SI128(sptr); sptr += 8; - xmm6 = 
_mm_load_si128((const __m128i*)sptr); + xmm6 = LOAD_SI128(sptr); sptr += 8; - xmm7 = _mm_load_si128((const __m128i*)sptr); + xmm7 = LOAD_SI128(sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm4); xmm1 = _mm_sign_epi16(xmm1, xmm5); xmm2 = _mm_sign_epi16(xmm2, xmm6); xmm3 = _mm_sign_epi16(xmm3, xmm7); - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 8; - _mm_store_si128((__m128i*)dptr, xmm1); + STORE_SI128(dptr, xmm1); dptr += 8; - _mm_store_si128((__m128i*)dptr, xmm2); + STORE_SI128(dptr, xmm2); dptr += 8; - _mm_store_si128((__m128i*)dptr, xmm3); + STORE_SI128(dptr, xmm3); dptr += 8; } } @@ -151,7 +152,7 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R __m128i xmm1 = LOAD_SI128(sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm1); - _mm_store_si128((__m128i*)dptr, xmm0); + STORE_SI128(dptr, xmm0); dptr += 8; }