[primitives,sse] unify load/store

* Use LOAD_SI128 to load __m128i values
* Use STORE_SI128 to store __m128i values
This commit is contained in:
akallabeth
2025-02-11 15:47:55 +01:00
committed by Armin Novak
parent 76012aac42
commit fd13e9b919
14 changed files with 357 additions and 375 deletions

View File

@@ -41,20 +41,20 @@ set(CODEC_SRCS
yuv.c
)
set(CODEC_SSE2_SRCS sse/rfx_sse2.c sse/rfx_sse2.h sse/nsc_sse2.c sse/nsc_sse2.h)
set(CODEC_SSE3_SRCS sse/rfx_sse2.c sse/rfx_sse2.h sse/nsc_sse2.c sse/nsc_sse2.h)
set(CODEC_NEON_SRCS neon/rfx_neon.c neon/rfx_neon.h neon/nsc_neon.c neon/nsc_neon.h)
# Append initializers
set(CODEC_LIBS "")
list(APPEND CODEC_SRCS ${CODEC_SSE2_SRCS})
list(APPEND CODEC_SRCS ${CODEC_SSE3_SRCS})
list(APPEND CODEC_SRCS ${CODEC_NEON_SRCS})
include(CompilerDetect)
include(DetectIntrinsicSupport)
if(WITH_SIMD)
set_simd_source_file_properties("sse2" ${CODEC_SSE2_SRCS})
set_simd_source_file_properties("sse3" ${CODEC_SSE3_SRCS})
set_simd_source_file_properties("neon" ${CODEC_NEON_SRCS})
endif()

View File

@@ -26,6 +26,7 @@
#include "nsc_sse2.h"
#include "../../core/simd.h"
#include "../../primitives/sse/prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <stdio.h>
@@ -290,13 +291,13 @@ static BOOL nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context, const BYTE* dat
cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1));
cg_val = _mm_srai_epi16(cg_val, ccl);
y_val = _mm_packus_epi16(y_val, y_val);
_mm_storeu_si128((__m128i*)yplane, y_val);
STORE_SI128(yplane, y_val);
co_val = _mm_packs_epi16(co_val, co_val);
_mm_storeu_si128((__m128i*)coplane, co_val);
STORE_SI128(coplane, co_val);
cg_val = _mm_packs_epi16(cg_val, cg_val);
_mm_storeu_si128((__m128i*)cgplane, cg_val);
STORE_SI128(cgplane, cg_val);
a_val = _mm_packus_epi16(a_val, a_val);
_mm_storeu_si128((__m128i*)aplane, a_val);
STORE_SI128(aplane, a_val);
yplane += 8;
coplane += 8;
cgplane += 8;
@@ -354,21 +355,21 @@ static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context)
for (UINT32 x = 0; x < tempWidth >> 1; x += 8)
{
t = _mm_loadu_si128((__m128i*)co_src0);
t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*)co_src1));
t = LOAD_SI128(co_src0);
t = _mm_avg_epu8(t, LOAD_SI128(co_src1));
val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
val = _mm_packus_epi16(val, val);
_mm_storeu_si128((__m128i*)co_dst, val);
STORE_SI128(co_dst, val);
co_dst += 8;
co_src0 += 16;
co_src1 += 16;
t = _mm_loadu_si128((__m128i*)cg_src0);
t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*)cg_src1));
t = LOAD_SI128(cg_src0);
t = _mm_avg_epu8(t, LOAD_SI128(cg_src1));
val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
val = _mm_packus_epi16(val, val);
_mm_storeu_si128((__m128i*)cg_dst, val);
STORE_SI128(cg_dst, val);
cg_dst += 8;
cg_src0 += 16;
cg_src1 += 16;
@@ -391,7 +392,8 @@ static BOOL nsc_encode_sse2(NSC_CONTEXT* context, const BYTE* data, UINT32 scanl
void nsc_init_sse2(NSC_CONTEXT* context)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;
PROFILER_RENAME(context->priv->prof_nsc_encode, "nsc_encode_sse2")

View File

@@ -27,6 +27,7 @@
#include "rfx_sse2.h"
#include "../../core/simd.h"
#include "../../primitives/sse/prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <stdio.h>
@@ -75,10 +76,10 @@ rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer, const size_t bu
do
{
const __m128i la = _mm_load_si128(ptr);
const __m128i la = LOAD_SI128(ptr);
const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(int, factor));
_mm_store_si128(ptr, a);
STORE_SI128(ptr, a);
ptr++;
} while (ptr < buf_end);
}
@@ -116,10 +117,10 @@ rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer, const unsigned
do
{
const __m128i la = _mm_load_si128(ptr);
const __m128i la = LOAD_SI128(ptr);
__m128i a = _mm_add_epi16(la, half);
a = _mm_srai_epi16(a, factor);
_mm_store_si128(ptr, a);
STORE_SI128(ptr, a);
ptr++;
} while (ptr < buf_end);
}
@@ -177,9 +178,9 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
for (size_t n = 0; n < subband_width; n += 8)
{
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
__m128i l_n = _mm_load_si128((__m128i*)l_ptr);
__m128i h_n = _mm_load_si128((__m128i*)h_ptr);
__m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - 1));
__m128i l_n = LOAD_SI128(l_ptr);
__m128i h_n = LOAD_SI128(h_ptr);
__m128i h_n_m = LOAD_SI128(h_ptr - 1);
if (n == 0)
{
@@ -191,7 +192,7 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
tmp_n = _mm_srai_epi16(tmp_n, 1);
const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*)l_ptr, dst_n);
STORE_SI128(l_ptr, dst_n);
l_ptr += 8;
h_ptr += 8;
}
@@ -203,10 +204,10 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
for (size_t n = 0; n < subband_width; n += 8)
{
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
__m128i h_n = _mm_load_si128((__m128i*)h_ptr);
__m128i h_n = LOAD_SI128(h_ptr);
h_n = _mm_slli_epi16(h_n, 1);
__m128i dst_n = _mm_load_si128((__m128i*)(l_ptr));
__m128i dst_n_p = _mm_loadu_si128((__m128i*)(l_ptr + 1));
__m128i dst_n = LOAD_SI128(l_ptr);
__m128i dst_n_p = LOAD_SI128(l_ptr + 1);
if (n == subband_width - 8)
{
@@ -219,8 +220,8 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
tmp_n = _mm_add_epi16(tmp_n, h_n);
dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
_mm_store_si128((__m128i*)dst_ptr, dst1);
_mm_store_si128((__m128i*)(dst_ptr + 8), dst2);
STORE_SI128(dst_ptr, dst1);
STORE_SI128(dst_ptr + 8, dst2);
l_ptr += 8;
h_ptr += 8;
dst_ptr += 16;
@@ -243,21 +244,21 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT
for (size_t x = 0; x < total_width; x += 8)
{
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
const __m128i l_n = _mm_load_si128((__m128i*)l_ptr);
const __m128i h_n = _mm_load_si128((__m128i*)h_ptr);
const __m128i l_n = LOAD_SI128(l_ptr);
const __m128i h_n = LOAD_SI128(h_ptr);
__m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
if (n == 0)
tmp_n = _mm_add_epi16(tmp_n, h_n);
else
{
const __m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - total_width));
const __m128i h_n_m = LOAD_SI128(h_ptr - total_width);
tmp_n = _mm_add_epi16(tmp_n, h_n_m);
}
tmp_n = _mm_srai_epi16(tmp_n, 1);
const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*)dst_ptr, dst_n);
STORE_SI128(dst_ptr, dst_n);
l_ptr += 8;
h_ptr += 8;
dst_ptr += 8;
@@ -275,8 +276,8 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT
for (size_t x = 0; x < total_width; x += 8)
{
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
__m128i h_n = _mm_load_si128((__m128i*)h_ptr);
__m128i dst_n_m = _mm_load_si128((__m128i*)(dst_ptr - total_width));
__m128i h_n = LOAD_SI128(h_ptr);
__m128i dst_n_m = LOAD_SI128(dst_ptr - total_width);
h_n = _mm_slli_epi16(h_n, 1);
__m128i tmp_n = dst_n_m;
@@ -284,13 +285,13 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT
tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
else
{
const __m128i dst_n_p = _mm_loadu_si128((__m128i*)(dst_ptr + total_width));
const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width);
tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
}
tmp_n = _mm_srai_epi16(tmp_n, 1);
const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
_mm_store_si128((__m128i*)dst_ptr, dst_n);
STORE_SI128(dst_ptr, dst_n);
h_ptr += 8;
dst_ptr += 8;
}
@@ -342,29 +343,29 @@ rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRI
{
for (size_t x = 0; x < total_width; x += 8)
{
__m128i src_2n = _mm_load_si128((__m128i*)src);
__m128i src_2n_1 = _mm_load_si128((__m128i*)(src + total_width));
__m128i src_2n = LOAD_SI128(src);
__m128i src_2n_1 = LOAD_SI128(src + total_width);
__m128i src_2n_2 = src_2n;
if (n < subband_width - 1)
src_2n_2 = _mm_load_si128((__m128i*)(src + 2ULL * total_width));
src_2n_2 = LOAD_SI128(src + 2ULL * total_width);
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
__m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
h_n = _mm_srai_epi16(h_n, 1);
h_n = _mm_sub_epi16(src_2n_1, h_n);
h_n = _mm_srai_epi16(h_n, 1);
_mm_store_si128((__m128i*)h, h_n);
STORE_SI128(h, h_n);
__m128i h_n_m = h_n;
if (n != 0)
h_n_m = _mm_load_si128((__m128i*)(h - total_width));
h_n_m = LOAD_SI128(h - total_width);
/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
__m128i l_n = _mm_add_epi16(h_n_m, h_n);
l_n = _mm_srai_epi16(l_n, 1);
l_n = _mm_add_epi16(l_n, src_2n);
_mm_store_si128((__m128i*)l, l_n);
STORE_SI128(l, l_n);
src += 8;
l += 8;
h += 8;
@@ -396,8 +397,8 @@ rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTR
h_n = _mm_srai_epi16(h_n, 1);
h_n = _mm_sub_epi16(src_2n_1, h_n);
h_n = _mm_srai_epi16(h_n, 1);
_mm_store_si128((__m128i*)h, h_n);
__m128i h_n_m = _mm_loadu_si128((__m128i*)(h - 1));
STORE_SI128(h, h_n);
__m128i h_n_m = LOAD_SI128(h - 1);
if (n == 0)
{
@@ -409,7 +410,7 @@ rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTR
__m128i l_n = _mm_add_epi16(h_n_m, h_n);
l_n = _mm_srai_epi16(l_n, 1);
l_n = _mm_add_epi16(l_n, src_2n);
_mm_store_si128((__m128i*)l, l_n);
STORE_SI128(l, l_n);
src += 16;
l += 8;
h += 8;
@@ -453,7 +454,8 @@ static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RE
void rfx_init_sse2(RFX_CONTEXT* context)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;
PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2")

View File

@@ -25,9 +25,16 @@ set(PRIMITIVES_SRCS
prim_internal.h
)
set(PRIMITIVES_SSE2_SRCS sse/prim_colors_sse2.c sse/prim_set_sse2.c sse/prim_avxsse.h sse/prim_templates.h)
set(PRIMITIVES_SSE3_SRCS sse/prim_add_sse3.c sse/prim_alphaComp_sse3.c sse/prim_andor_sse3.c sse/prim_shift_sse3.c)
set(PRIMITIVES_SSE3_SRCS
sse/prim_avxsse.h
sse/prim_templates.h
sse/prim_colors_sse2.c
sse/prim_set_sse2.c
sse/prim_add_sse3.c
sse/prim_alphaComp_sse3.c
sse/prim_andor_sse3.c
sse/prim_shift_sse3.c
)
set(PRIMITIVES_SSSE3_SRCS sse/prim_sign_ssse3.c sse/prim_YCoCg_ssse3.c)
@@ -58,14 +65,8 @@ if(WITH_OPENCL)
freerdp_library_add(OpenCL::OpenCL)
endif()
set(PRIMITIVES_OPT_SRCS
${PRIMITIVES_NEON_SRCS}
${PRIMITIVES_SSE2_SRCS}
${PRIMITIVES_SSE3_SRCS}
${PRIMITIVES_SSSE3_SRCS}
${PRIMITIVES_SSE4_1_SRCS}
${PRIMITIVES_SSE4_2_SRCS}
${PRIMITIVES_OPENCL_SRCS}
set(PRIMITIVES_OPT_SRCS ${PRIMITIVES_NEON_SRCS} ${PRIMITIVES_SSE3_SRCS} ${PRIMITIVES_SSSE3_SRCS}
${PRIMITIVES_SSE4_1_SRCS} ${PRIMITIVES_SSE4_2_SRCS} ${PRIMITIVES_OPENCL_SRCS}
)
if(WITH_AVX2)
@@ -80,7 +81,6 @@ add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS})
include(CompilerDetect)
include(DetectIntrinsicSupport)
if(WITH_SIMD)
set_simd_source_file_properties("sse2" ${PRIMITIVES_SSE2_SRCS})
set_simd_source_file_properties("sse3" ${PRIMITIVES_SSE3_SRCS})
set_simd_source_file_properties("ssse3" ${PRIMITIVES_SSSE3_SRCS})
set_simd_source_file_properties("sse4.1" ${PRIMITIVES_SSE4_1_SRCS})

View File

@@ -73,7 +73,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
for (UINT32 h = 0; h < height; h++)
{
UINT32 w = width;
BOOL onStride = 0;
/* Get to a 16-byte destination boundary. */
if ((ULONG_PTR)dptr & 0x0f)
@@ -96,9 +95,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
w -= startup;
}
/* Each loop handles eight pixels at a time. */
onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
while (w >= 8)
{
__m128i R0;
@@ -110,22 +106,10 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
__m128i R6;
__m128i R7;
if (onStride)
{
/* The faster path, 16-byte aligned load. */
R0 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
else
{
/* Off-stride, slower LDDQU load. */
R0 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
R0 = LOAD_SI128(sptr);
sptr += (128 / 8);
R1 = LOAD_SI128(sptr);
sptr += (128 / 8);
/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
@@ -197,9 +181,9 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
R5 = _mm_unpackhi_epi16(R2, R3);
/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
_mm_store_si128((__m128i*)dptr, R4);
STORE_SI128(dptr, R4);
dptr += (128 / 8);
_mm_store_si128((__m128i*)dptr, R5);
STORE_SI128(dptr, R5);
dptr += (128 / 8);
w -= 8;
}
@@ -262,7 +246,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
for (UINT32 h = 0; h < height; h++)
{
UINT32 w = width;
BOOL onStride = 0;
/* Get to a 16-byte destination boundary. */
if ((ULONG_PTR)dptr & 0x0f)
@@ -285,47 +268,26 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
w -= startup;
}
/* Each loop handles eight pixels at a time. */
onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
while (w >= 8)
{
__m128i R0;
__m128i R1;
__m128i R2;
__m128i R3;
__m128i R4;
__m128i R5;
__m128i R6;
__m128i R7;
if (onStride)
{
/* The faster path, 16-byte aligned load. */
R0 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
else
{
/* Off-stride, slower LDDQU load. */
R0 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
/* The faster path, 16-byte aligned load. */
__m128i R0 = LOAD_SI128(sptr);
sptr += (128 / 8);
__m128i R1 = LOAD_SI128(sptr);
sptr += (128 / 8);
/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
/* Shuffle to pack all the like types together. */
R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
R3 = _mm_shuffle_epi8(R0, R2);
R4 = _mm_shuffle_epi8(R1, R2);
__m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
__m128i R3 = _mm_shuffle_epi8(R0, R2);
__m128i R4 = _mm_shuffle_epi8(R1, R2);
/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
R5 = _mm_unpackhi_epi32(R3, R4);
R6 = _mm_unpacklo_epi32(R3, R4);
__m128i R5 = _mm_unpackhi_epi32(R3, R4);
__m128i R6 = _mm_unpacklo_epi32(R3, R4);
/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
@@ -390,9 +352,9 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
R5 = _mm_unpackhi_epi16(R2, R3);
/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
_mm_store_si128((__m128i*)dptr, R4);
STORE_SI128(dptr, R4);
dptr += (128 / 8);
_mm_store_si128((__m128i*)dptr, R5);
STORE_SI128(dptr, R5);
dptr += (128 / 8);
w -= 8;
}
@@ -456,6 +418,7 @@ void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
primitives_init_YCoCg(prims);
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");

View File

@@ -28,6 +28,7 @@
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_YUV.h"
@@ -56,7 +57,7 @@ static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yr
mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
const __m128i c128 = _mm_set1_epi16(128);
__m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
__m128i BGRX = _mm_and_si128(LOAD_SI128(dst),
mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
{
__m128i C;
@@ -117,7 +118,7 @@ static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yr
BGRX = _mm_or_si128(BGRX, packed);
}
}
_mm_storeu_si128(dst++, BGRX);
STORE_SI128(dst++, BGRX);
return dst;
}
@@ -140,9 +141,9 @@ static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[]
for (UINT32 x = 0; x < nWidth - pad; x += 16)
{
const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
const __m128i Y = LOAD_SI128(YData);
const __m128i uRaw = LOAD_SI128(UData);
const __m128i vRaw = LOAD_SI128(VData);
const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
YData += 16;
@@ -445,12 +446,9 @@ static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
size_t x = 0;
for (; x < nWidth - pad; x += 16)
{
const __m128i Y[] = { _mm_loadu_si128((const __m128i*)&YData[0][x]),
_mm_loadu_si128((const __m128i*)&YData[1][x]) };
__m128i U[] = { _mm_loadu_si128((const __m128i*)&UData[0][x]),
_mm_loadu_si128((const __m128i*)&UData[1][x]) };
__m128i V[] = { _mm_loadu_si128((const __m128i*)&VData[0][x]),
_mm_loadu_si128((const __m128i*)&VData[1][x]) };
const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) };
__m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) };
__m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) };
BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
sse41_BGRX_fillRGB(dstp, Y, U, V);
@@ -636,21 +634,21 @@ static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
__m128i x0 = _mm_loadu_si128(argb++); // 1st 4 pixels
__m128i x0 = LOAD_SI128(argb++); // 1st 4 pixels
{
x0 = _mm_maddubs_epi16(x0, y_factors);
__m128i x1 = _mm_loadu_si128(argb++); // 2nd 4 pixels
__m128i x1 = LOAD_SI128(argb++); // 2nd 4 pixels
x1 = _mm_maddubs_epi16(x1, y_factors);
x0 = _mm_hadds_epi16(x0, x1);
x0 = _mm_srli_epi16(x0, Y_SHIFT);
}
__m128i x2 = _mm_loadu_si128(argb++); // 3rd 4 pixels
__m128i x2 = LOAD_SI128(argb++); // 3rd 4 pixels
{
x2 = _mm_maddubs_epi16(x2, y_factors);
__m128i x3 = _mm_loadu_si128(argb++); // 4th 4 pixels
__m128i x3 = LOAD_SI128(argb++); // 4th 4 pixels
x3 = _mm_maddubs_epi16(x3, y_factors);
x2 = _mm_hadds_epi16(x2, x3);
x2 = _mm_srli_epi16(x2, Y_SHIFT);
@@ -658,7 +656,7 @@ static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE
x0 = _mm_packus_epi16(x0, x2);
/* save to y plane */
_mm_storeu_si128(ydst++, x0);
STORE_SI128(ydst++, x0);
}
for (; x < width; x++)
@@ -688,20 +686,20 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
__m64* vdst = (__m64*)&dst2[x / 2];
/* subsample 16x2 pixels into 16x1 pixels */
__m128i x0 = _mm_loadu_si128(&rgb1[0]);
__m128i x4 = _mm_loadu_si128(&rgb2[0]);
__m128i x0 = LOAD_SI128(&rgb1[0]);
__m128i x4 = LOAD_SI128(&rgb2[0]);
x0 = _mm_avg_epu8(x0, x4);
__m128i x1 = _mm_loadu_si128(&rgb1[1]);
x4 = _mm_loadu_si128(&rgb2[1]);
__m128i x1 = LOAD_SI128(&rgb1[1]);
x4 = LOAD_SI128(&rgb2[1]);
x1 = _mm_avg_epu8(x1, x4);
__m128i x2 = _mm_loadu_si128(&rgb1[2]);
x4 = _mm_loadu_si128(&rgb2[2]);
__m128i x2 = LOAD_SI128(&rgb1[2]);
x4 = LOAD_SI128(&rgb2[2]);
x2 = _mm_avg_epu8(x2, x4);
__m128i x3 = _mm_loadu_si128(&rgb1[3]);
x4 = _mm_loadu_si128(&rgb2[3]);
__m128i x3 = LOAD_SI128(&rgb1[3]);
x4 = LOAD_SI128(&rgb2[3]);
x3 = _mm_avg_epu8(x3, x4);
/* subsample these 16x1 pixels into 8x1 pixels */
@@ -827,14 +825,14 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
const __m128i xe1 = _mm_loadu_si128(argbEven++); // 1st 4 pixels
const __m128i xe2 = _mm_loadu_si128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = _mm_loadu_si128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = _mm_loadu_si128(argbEven++); // 4th 4 pixels
const __m128i xo1 = _mm_loadu_si128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = _mm_loadu_si128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = _mm_loadu_si128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = _mm_loadu_si128(argbOdd++); // 4th 4 pixels
const __m128i xe1 = LOAD_SI128(argbEven++); // 1st 4 pixels
const __m128i xe2 = LOAD_SI128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = LOAD_SI128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = LOAD_SI128(argbEven++); // 4th 4 pixels
const __m128i xo1 = LOAD_SI128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = LOAD_SI128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = LOAD_SI128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = LOAD_SI128(argbOdd++); // 4th 4 pixels
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
@@ -852,12 +850,12 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
Y_SHIFT);
const __m128i yo = _mm_packus_epi16(yo1, yo2);
/* store y [b1] */
_mm_storeu_si128((__m128i*)b1Even, ye);
STORE_SI128(b1Even, ye);
b1Even += 16;
if (b1Odd)
{
_mm_storeu_si128((__m128i*)b1Odd, yo);
STORE_SI128(b1Odd, yo);
b1Odd += 16;
}
}
@@ -925,7 +923,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
if (b1Odd) /* b4 */
{
_mm_storeu_si128((__m128i*)b4, uo);
STORE_SI128(b4, uo);
b4 += 16;
}
@@ -1003,7 +1001,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
if (b1Odd) /* b5 */
{
_mm_storeu_si128((__m128i*)b5, vo);
STORE_SI128(b5, vo);
b5 += 16;
}
@@ -1117,14 +1115,14 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
/* store 16 rgba pixels in 4 128 bit registers
* for even and odd rows.
*/
const __m128i xe1 = _mm_loadu_si128(argbEven++); /* 1st 4 pixels */
const __m128i xe2 = _mm_loadu_si128(argbEven++); /* 2nd 4 pixels */
const __m128i xe3 = _mm_loadu_si128(argbEven++); /* 3rd 4 pixels */
const __m128i xe4 = _mm_loadu_si128(argbEven++); /* 4th 4 pixels */
const __m128i xo1 = _mm_loadu_si128(argbOdd++); /* 1st 4 pixels */
const __m128i xo2 = _mm_loadu_si128(argbOdd++); /* 2nd 4 pixels */
const __m128i xo3 = _mm_loadu_si128(argbOdd++); /* 3rd 4 pixels */
const __m128i xo4 = _mm_loadu_si128(argbOdd++); /* 4th 4 pixels */
const __m128i xe1 = LOAD_SI128(argbEven++); /* 1st 4 pixels */
const __m128i xe2 = LOAD_SI128(argbEven++); /* 2nd 4 pixels */
const __m128i xe3 = LOAD_SI128(argbEven++); /* 3rd 4 pixels */
const __m128i xe4 = LOAD_SI128(argbEven++); /* 4th 4 pixels */
const __m128i xo1 = LOAD_SI128(argbOdd++); /* 1st 4 pixels */
const __m128i xo2 = LOAD_SI128(argbOdd++); /* 2nd 4 pixels */
const __m128i xo3 = LOAD_SI128(argbOdd++); /* 3rd 4 pixels */
const __m128i xo4 = LOAD_SI128(argbOdd++); /* 4th 4 pixels */
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i y_factors = BGRX_Y_FACTORS;
@@ -1136,7 +1134,7 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
Y_SHIFT);
const __m128i ye = _mm_packus_epi16(ye1, ye2);
/* store y [b1] */
_mm_storeu_si128((__m128i*)yLumaDstEven, ye);
STORE_SI128(yLumaDstEven, ye);
yLumaDstEven += 16;
}
@@ -1150,7 +1148,7 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
_mm_maddubs_epi16(xo4, y_factors)),
Y_SHIFT);
const __m128i yo = _mm_packus_epi16(yo1, yo2);
_mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
STORE_SI128(yLumaDstOdd, yo);
yLumaDstOdd += 16;
}
@@ -1470,22 +1468,22 @@ static pstatus_t sse41_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const
const __m128i unpackLow =
_mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
const __m128i u = LOAD_SI128(&Um[x]);
const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
_mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
_mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
_mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
_mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
STORE_SI128(&pU[2ULL * x], uHigh);
STORE_SI128(&pU[2ULL * x + 16], uLow);
STORE_SI128(&pU1[2ULL * x], uHigh);
STORE_SI128(&pU1[2ULL * x + 16], uLow);
}
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
const __m128i u = LOAD_SI128(&Vm[x]);
const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
_mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
_mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
_mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
_mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
STORE_SI128(&pV[2 * x], uHigh);
STORE_SI128(&pV[2 * x + 16], uLow);
STORE_SI128(&pV1[2 * x], uHigh);
STORE_SI128(&pV1[2 * x + 16], uLow);
}
}
@@ -1578,14 +1576,14 @@ static pstatus_t sse41_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
for (; x < halfWidth - halfPad; x += 16)
{
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
const __m128i u = LOAD_SI128(&Ua[x]);
const __m128i u2 = _mm_unpackhi_epi8(u, zero);
const __m128i u1 = _mm_unpacklo_epi8(u, zero);
_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
}
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
const __m128i u = LOAD_SI128(&Va[x]);
const __m128i u2 = _mm_unpackhi_epi8(u, zero);
const __m128i u1 = _mm_unpacklo_epi8(u, zero);
_mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
@@ -1641,14 +1639,14 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons
for (; x < halfWidth - halfPad; x += 16)
{
{
const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
const __m128i u = LOAD_SI128(&pYaU[x]);
const __m128i u2 = _mm_unpackhi_epi8(zero, u);
const __m128i u1 = _mm_unpacklo_epi8(zero, u);
_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
}
{
const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
const __m128i v = LOAD_SI128(&pYaV[x]);
const __m128i v2 = _mm_unpackhi_epi8(zero, v);
const __m128i v1 = _mm_unpacklo_epi8(zero, v);
_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
@@ -1678,8 +1676,8 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons
for (; x < quaterWidth - quaterPad; x += 16)
{
{
const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
const __m128i uU = LOAD_SI128(&pUaU[x]);
const __m128i uV = LOAD_SI128(&pVaU[x]);
const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
@@ -1692,8 +1690,8 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons
_mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
}
{
const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
const __m128i vU = LOAD_SI128(&pUaV[x]);
const __m128i vV = LOAD_SI128(&pVaV[x]);
const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);

View File

@@ -75,29 +75,29 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
__m128i* vdptr1 = (__m128i*)dptr1;
__m128i* vdptr2 = (__m128i*)dptr2;
__m128i xmm0 = _mm_lddqu_si128(vsptr1++);
__m128i xmm1 = _mm_lddqu_si128(vsptr1++);
__m128i xmm2 = _mm_lddqu_si128(vsptr1++);
__m128i xmm3 = _mm_lddqu_si128(vsptr1++);
__m128i xmm4 = _mm_lddqu_si128(vsptr2++);
__m128i xmm5 = _mm_lddqu_si128(vsptr2++);
__m128i xmm6 = _mm_lddqu_si128(vsptr2++);
__m128i xmm7 = _mm_lddqu_si128(vsptr2++);
__m128i xmm0 = LOAD_SI128(vsptr1++);
__m128i xmm1 = LOAD_SI128(vsptr1++);
__m128i xmm2 = LOAD_SI128(vsptr1++);
__m128i xmm3 = LOAD_SI128(vsptr1++);
__m128i xmm4 = LOAD_SI128(vsptr2++);
__m128i xmm5 = LOAD_SI128(vsptr2++);
__m128i xmm6 = LOAD_SI128(vsptr2++);
__m128i xmm7 = LOAD_SI128(vsptr2++);
xmm0 = _mm_adds_epi16(xmm0, xmm4);
xmm1 = _mm_adds_epi16(xmm1, xmm5);
xmm2 = _mm_adds_epi16(xmm2, xmm6);
xmm3 = _mm_adds_epi16(xmm3, xmm7);
_mm_store_si128(vdptr1++, xmm0);
_mm_store_si128(vdptr1++, xmm1);
_mm_store_si128(vdptr1++, xmm2);
_mm_store_si128(vdptr1++, xmm3);
STORE_SI128(vdptr1++, xmm0);
STORE_SI128(vdptr1++, xmm1);
STORE_SI128(vdptr1++, xmm2);
STORE_SI128(vdptr1++, xmm3);
_mm_store_si128(vdptr2++, xmm0);
_mm_store_si128(vdptr2++, xmm1);
_mm_store_si128(vdptr2++, xmm2);
_mm_store_si128(vdptr2++, xmm3);
STORE_SI128(vdptr2++, xmm0);
STORE_SI128(vdptr2++, xmm1);
STORE_SI128(vdptr2++, xmm2);
STORE_SI128(vdptr2++, xmm3);
dptr1 = (INT16*)vdptr1;
dptr2 = (INT16*)vdptr2;
@@ -113,29 +113,29 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
__m128i* vdptr1 = (__m128i*)dptr1;
__m128i* vdptr2 = (__m128i*)dptr2;
__m128i xmm0 = _mm_load_si128(vsptr1++);
__m128i xmm1 = _mm_load_si128(vsptr1++);
__m128i xmm2 = _mm_load_si128(vsptr1++);
__m128i xmm3 = _mm_load_si128(vsptr1++);
__m128i xmm4 = _mm_load_si128(vsptr2++);
__m128i xmm5 = _mm_load_si128(vsptr2++);
__m128i xmm6 = _mm_load_si128(vsptr2++);
__m128i xmm7 = _mm_load_si128(vsptr2++);
__m128i xmm0 = LOAD_SI128(vsptr1++);
__m128i xmm1 = LOAD_SI128(vsptr1++);
__m128i xmm2 = LOAD_SI128(vsptr1++);
__m128i xmm3 = LOAD_SI128(vsptr1++);
__m128i xmm4 = LOAD_SI128(vsptr2++);
__m128i xmm5 = LOAD_SI128(vsptr2++);
__m128i xmm6 = LOAD_SI128(vsptr2++);
__m128i xmm7 = LOAD_SI128(vsptr2++);
xmm0 = _mm_adds_epi16(xmm0, xmm4);
xmm1 = _mm_adds_epi16(xmm1, xmm5);
xmm2 = _mm_adds_epi16(xmm2, xmm6);
xmm3 = _mm_adds_epi16(xmm3, xmm7);
_mm_store_si128(vdptr1++, xmm0);
_mm_store_si128(vdptr1++, xmm1);
_mm_store_si128(vdptr1++, xmm2);
_mm_store_si128(vdptr1++, xmm3);
STORE_SI128(vdptr1++, xmm0);
STORE_SI128(vdptr1++, xmm1);
STORE_SI128(vdptr1++, xmm2);
STORE_SI128(vdptr1++, xmm3);
_mm_store_si128(vdptr2++, xmm0);
_mm_store_si128(vdptr2++, xmm1);
_mm_store_si128(vdptr2++, xmm2);
_mm_store_si128(vdptr2++, xmm3);
STORE_SI128(vdptr2++, xmm0);
STORE_SI128(vdptr2++, xmm1);
STORE_SI128(vdptr2++, xmm2);
STORE_SI128(vdptr2++, xmm3);
dptr1 = (INT16*)vdptr1;
dptr2 = (INT16*)vdptr2;
@@ -156,8 +156,8 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
xmm0 = _mm_adds_epi16(xmm0, xmm1);
_mm_store_si128(vdptr1++, xmm0);
_mm_store_si128(vdptr2++, xmm0);
STORE_SI128(vdptr1++, xmm0);
STORE_SI128(vdptr2++, xmm0);
dptr1 = (INT16*)vdptr1;
dptr2 = (INT16*)vdptr2;

View File

@@ -28,6 +28,7 @@
#include "prim_alphaComp.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
/* ------------------------------------------------------------------------- */
@@ -171,7 +172,7 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 sr
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
_mm_store_si128((__m128i*)dptr, xmm5);
STORE_SI128(dptr, xmm5);
dptr += 4;
}

View File

@@ -19,13 +19,19 @@
*/
#pragma once
#include "prim_internal.h"
#include <winpr/cast.h>
#include "../../core/simd.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
{
return _mm_set_epi32((int32_t)val1, (int32_t)val2, (int32_t)val3, (int32_t)val4);
return _mm_set_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val1), WINPR_CXX_COMPAT_CAST(int32_t, val2),
WINPR_CXX_COMPAT_CAST(int32_t, val3),
WINPR_CXX_COMPAT_CAST(int32_t, val4));
}
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
@@ -33,31 +39,36 @@ static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint
uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
{
return _mm_set_epi8((int8_t)val1, (int8_t)val2, (int8_t)val3, (int8_t)val4, (int8_t)val5,
(int8_t)val6, (int8_t)val7, (int8_t)val8, (int8_t)val9, (int8_t)val10,
(int8_t)val11, (int8_t)val12, (int8_t)val13, (int8_t)val14, (int8_t)val15,
(int8_t)val16);
return _mm_set_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val1), WINPR_CXX_COMPAT_CAST(int8_t, val2),
WINPR_CXX_COMPAT_CAST(int8_t, val3), WINPR_CXX_COMPAT_CAST(int8_t, val4),
WINPR_CXX_COMPAT_CAST(int8_t, val5), WINPR_CXX_COMPAT_CAST(int8_t, val6),
WINPR_CXX_COMPAT_CAST(int8_t, val7), WINPR_CXX_COMPAT_CAST(int8_t, val8),
WINPR_CXX_COMPAT_CAST(int8_t, val9), WINPR_CXX_COMPAT_CAST(int8_t, val10),
WINPR_CXX_COMPAT_CAST(int8_t, val11), WINPR_CXX_COMPAT_CAST(int8_t, val12),
WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14),
WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16));
}
static inline __m128i mm_set1_epu32(uint32_t val)
{
return _mm_set1_epi32((int32_t)val);
return _mm_set1_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val));
}
static inline __m128i mm_set1_epu8(uint8_t val)
{
return _mm_set1_epi8((int8_t)val);
return _mm_set1_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val));
}
/* Use lddqu for unaligned; load for 16-byte aligned. */
static inline __m128i LOAD_SI128(const void* ptr)
{
const ULONG_PTR uptr = (const ULONG_PTR)ptr;
const __m128i* mptr = (const __m128i*)ptr;
if ((uptr & 0x0f) != 0)
return _mm_loadu_si128(mptr);
const __m128i* mptr = WINPR_CXX_COMPAT_CAST(const __m128i*, ptr);
return _mm_lddqu_si128(mptr);
}
return _mm_load_si128(mptr);
static inline void STORE_SI128(void* ptr, __m128i val)
{
__m128i* mptr = WINPR_CXX_COMPAT_CAST(__m128i*, ptr);
_mm_storeu_si128(mptr, val);
}
#endif

View File

@@ -181,32 +181,32 @@ sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y = _mm_load_si128(y_buf + i);
__m128i y = LOAD_SI128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
__m128i cb = _mm_load_si128(cb_buf + i);
__m128i cb = LOAD_SI128(cb_buf + i);
/* cr = cr_b_buf[i]; */
__m128i cr = _mm_load_si128(cr_buf + i);
__m128i cr = LOAD_SI128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = CLIP(r); */
mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
STORE_SI128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
__m128i g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = CLIP(g); */
mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
STORE_SI128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
__m128i b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
STORE_SI128(b_buf + i, b);
}
y_buf += srcbump;
@@ -291,15 +291,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y1 = _mm_load_si128((const __m128i*)y_buf);
__m128i y1 = LOAD_SI128(y_buf);
y_buf += step;
y1 = _mm_add_epi16(y1, c4096);
y1 = _mm_srai_epi16(y1, 2);
/* cb = cb_g_buf[i]; */
__m128i cb1 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb1 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr1 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr1 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
@@ -317,15 +317,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
b1 = _mm_srai_epi16(b1, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b1, zero, max);
__m128i y2 = _mm_load_si128((const __m128i*)y_buf);
__m128i y2 = LOAD_SI128(y_buf);
y_buf += step;
y2 = _mm_add_epi16(y2, c4096);
y2 = _mm_srai_epi16(y2, 2);
/* cb = cb_g_buf[i]; */
__m128i cb2 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb2 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr2 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr2 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
@@ -369,13 +369,13 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
R2 = R3; /* R2 = R3 */
R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
STORE_SI128(d_buf, R0); /* B1G1R1FFB0G0R0FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
STORE_SI128(d_buf, R4); /* B3G3R3FFB2G2R2FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
STORE_SI128(d_buf, R2); /* B5G5R5FFB4G4R4FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
STORE_SI128(d_buf, R3); /* B7G7R7FFB6G6R6FF */
d_buf += sizeof(__m128i);
}
}
@@ -476,15 +476,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y1 = _mm_load_si128((const __m128i*)y_buf);
__m128i y1 = LOAD_SI128(y_buf);
y_buf += step;
y1 = _mm_add_epi16(y1, c4096);
y1 = _mm_srai_epi16(y1, 2);
/* cb = cb_g_buf[i]; */
__m128i cb1 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb1 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr1 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr1 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
@@ -502,15 +502,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
b1 = _mm_srai_epi16(b1, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b1, zero, max);
__m128i y2 = _mm_load_si128((const __m128i*)y_buf);
__m128i y2 = LOAD_SI128(y_buf);
y_buf += step;
y2 = _mm_add_epi16(y2, c4096);
y2 = _mm_srai_epi16(y2, 2);
/* cb = cb_g_buf[i]; */
__m128i cb2 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb2 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr2 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr2 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
@@ -554,13 +554,13 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
R2 = R3; /* R2 = R3 */
R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
STORE_SI128(d_buf, R0); /* R1G1B1FFR0G0B0FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
STORE_SI128(d_buf, R4); /* R3G3B3FFR2G2B2FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
STORE_SI128(d_buf, R2); /* R5G5B5FFR4G4B4FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
STORE_SI128(d_buf, R3); /* R7G7B7FFR6G6B6FF */
d_buf += sizeof(__m128i);
}
}
@@ -694,9 +694,9 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
* within the upper 16 bits we will also have to scale the RGB
* values used in the multiplication by << 5+(16-n).
*/
__m128i r = _mm_load_si128(r_buf + i);
__m128i g = _mm_load_si128(g_buf + i);
__m128i b = _mm_load_si128(b_buf + i);
__m128i r = LOAD_SI128(r_buf + i);
__m128i g = LOAD_SI128(g_buf + i);
__m128i b = LOAD_SI128(b_buf + i);
/* r<<6; g<<6; b<<6 */
r = _mm_slli_epi16(r, 6);
g = _mm_slli_epi16(g, 6);
@@ -708,21 +708,21 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
y = _mm_add_epi16(y, min);
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
mm_between_epi16(y, min, max);
_mm_store_si128(y_buf + i, y);
STORE_SI128(y_buf + i, y);
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
__m128i cb = _mm_mulhi_epi16(r, cb_r);
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
mm_between_epi16(cb, min, max);
_mm_store_si128(cb_buf + i, cb);
STORE_SI128(cb_buf + i, cb);
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
__m128i cr = _mm_mulhi_epi16(r, cr_r);
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
mm_between_epi16(cr, min, max);
_mm_store_si128(cr_buf + i, cr);
STORE_SI128(cr_buf + i, cr);
}
y_buf += srcbump;
@@ -769,27 +769,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -801,22 +801,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -875,27 +875,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -912,22 +912,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -986,27 +986,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -1023,22 +1023,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -1097,27 +1097,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -1134,22 +1134,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -1217,7 +1217,8 @@ void primitives_init_colors_sse2(primitives_t* prims)
generic = primitives_get_generic();
primitives_init_colors(prims);
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE2 optimizations");
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;

View File

@@ -22,6 +22,7 @@
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_copy.h"
#include "../codec/color.h"
@@ -68,12 +69,12 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat
{
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
const __m128i s0 = _mm_loadu_si128(src);
const __m128i s0 = LOAD_SI128(src);
const __m128i s1 = _mm_shuffle_epi8(s0, smask);
const __m128i s2 = _mm_loadu_si128(dst);
const __m128i s2 = LOAD_SI128(dst);
__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
_mm_storeu_si128(dst, d0);
STORE_SI128(dst, d0);
}
}
for (; x < nWidth; x++)
@@ -118,10 +119,10 @@ static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstDa
{
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
const __m128i s0 = _mm_loadu_si128(src);
const __m128i s1 = _mm_loadu_si128(dst);
const __m128i s0 = LOAD_SI128(src);
const __m128i s1 = LOAD_SI128(dst);
__m128i d0 = _mm_blendv_epi8(s1, s0, mask);
_mm_storeu_si128(dst, d0);
STORE_SI128(dst, d0);
}
for (; x < nWidth; x++)

View File

@@ -21,7 +21,8 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_avxsse.h".h "
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_set.h"
/* ========================================================================= */
@@ -60,37 +61,37 @@ static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
}
@@ -101,7 +102,7 @@ static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
}
@@ -152,37 +153,37 @@ static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 le
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
}
@@ -193,7 +194,7 @@ static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 le
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
}
@@ -220,7 +221,8 @@ void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
primitives_init_set(prims);
/* Pick tuned versions if possible. */
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE2 optimizations");
prims->set_8u = sse2_set_8u;

View File

@@ -79,14 +79,14 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
{
const __m128i* src = (const __m128i*)pSrcDst;
__m128i xmm0 = _mm_load_si128(src++);
__m128i xmm1 = _mm_load_si128(src++);
__m128i xmm2 = _mm_load_si128(src++);
__m128i xmm3 = _mm_load_si128(src++);
__m128i xmm4 = _mm_load_si128(src++);
__m128i xmm5 = _mm_load_si128(src++);
__m128i xmm6 = _mm_load_si128(src++);
__m128i xmm7 = _mm_load_si128(src);
__m128i xmm0 = LOAD_SI128(src++);
__m128i xmm1 = LOAD_SI128(src++);
__m128i xmm2 = LOAD_SI128(src++);
__m128i xmm3 = LOAD_SI128(src++);
__m128i xmm4 = LOAD_SI128(src++);
__m128i xmm5 = LOAD_SI128(src++);
__m128i xmm6 = LOAD_SI128(src++);
__m128i xmm7 = LOAD_SI128(src);
xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
@@ -99,14 +99,14 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
__m128i* dst = (__m128i*)pSrcDst;
_mm_store_si128(dst++, xmm0);
_mm_store_si128(dst++, xmm1);
_mm_store_si128(dst++, xmm2);
_mm_store_si128(dst++, xmm3);
_mm_store_si128(dst++, xmm4);
_mm_store_si128(dst++, xmm5);
_mm_store_si128(dst++, xmm6);
_mm_store_si128(dst++, xmm7);
STORE_SI128(dst++, xmm0);
STORE_SI128(dst++, xmm1);
STORE_SI128(dst++, xmm2);
STORE_SI128(dst++, xmm3);
STORE_SI128(dst++, xmm4);
STORE_SI128(dst++, xmm5);
STORE_SI128(dst++, xmm6);
STORE_SI128(dst++, xmm7);
pSrcDst = (INT16*)dst;
}
@@ -122,7 +122,7 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
__m128i* dst = (__m128i*)pSrcDst;
_mm_store_si128(dst++, xmm0);
STORE_SI128(dst++, xmm0);
pSrcDst = (INT16*)dst;
}

View File

@@ -21,6 +21,7 @@
#include "prim_sign.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
@@ -79,25 +80,25 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
xmm4 = LOAD_SI128(sptr);
sptr += 8;
xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
xmm5 = LOAD_SI128(sptr);
sptr += 8;
xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
xmm6 = LOAD_SI128(sptr);
sptr += 8;
xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
xmm7 = LOAD_SI128(sptr);
sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm1);
STORE_SI128(dptr, xmm1);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm2);
STORE_SI128(dptr, xmm2);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm3);
STORE_SI128(dptr, xmm3);
dptr += 8;
}
}
@@ -118,25 +119,25 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_load_si128((const __m128i*)sptr);
xmm4 = LOAD_SI128(sptr);
sptr += 8;
xmm5 = _mm_load_si128((const __m128i*)sptr);
xmm5 = LOAD_SI128(sptr);
sptr += 8;
xmm6 = _mm_load_si128((const __m128i*)sptr);
xmm6 = LOAD_SI128(sptr);
sptr += 8;
xmm7 = _mm_load_si128((const __m128i*)sptr);
xmm7 = LOAD_SI128(sptr);
sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm1);
STORE_SI128(dptr, xmm1);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm2);
STORE_SI128(dptr, xmm2);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm3);
STORE_SI128(dptr, xmm3);
dptr += 8;
}
}
@@ -151,7 +152,7 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
__m128i xmm1 = LOAD_SI128(sptr);
sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm1);
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 8;
}