[primitives,sse] unify load/store

* Use LOAD_SI128 to load __m128i values
* Use STORE_SI128 to store __m128i values
This commit is contained in:
akallabeth
2025-02-11 15:47:55 +01:00
committed by Armin Novak
parent 76012aac42
commit fd13e9b919
14 changed files with 357 additions and 375 deletions

View File

@@ -41,20 +41,20 @@ set(CODEC_SRCS
yuv.c
)
set(CODEC_SSE2_SRCS sse/rfx_sse2.c sse/rfx_sse2.h sse/nsc_sse2.c sse/nsc_sse2.h)
set(CODEC_SSE3_SRCS sse/rfx_sse2.c sse/rfx_sse2.h sse/nsc_sse2.c sse/nsc_sse2.h)
set(CODEC_NEON_SRCS neon/rfx_neon.c neon/rfx_neon.h neon/nsc_neon.c neon/nsc_neon.h)
# Append initializers
set(CODEC_LIBS "")
list(APPEND CODEC_SRCS ${CODEC_SSE2_SRCS})
list(APPEND CODEC_SRCS ${CODEC_SSE3_SRCS})
list(APPEND CODEC_SRCS ${CODEC_NEON_SRCS})
include(CompilerDetect)
include(DetectIntrinsicSupport)
if(WITH_SIMD)
set_simd_source_file_properties("sse2" ${CODEC_SSE2_SRCS})
set_simd_source_file_properties("sse3" ${CODEC_SSE3_SRCS})
set_simd_source_file_properties("neon" ${CODEC_NEON_SRCS})
endif()

View File

@@ -26,6 +26,7 @@
#include "nsc_sse2.h"
#include "../../core/simd.h"
#include "../../primitives/sse/prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <stdio.h>
@@ -290,13 +291,13 @@ static BOOL nsc_encode_argb_to_aycocg_sse2(NSC_CONTEXT* context, const BYTE* dat
cg_val = _mm_sub_epi16(cg_val, _mm_srai_epi16(b_val, 1));
cg_val = _mm_srai_epi16(cg_val, ccl);
y_val = _mm_packus_epi16(y_val, y_val);
_mm_storeu_si128((__m128i*)yplane, y_val);
STORE_SI128(yplane, y_val);
co_val = _mm_packs_epi16(co_val, co_val);
_mm_storeu_si128((__m128i*)coplane, co_val);
STORE_SI128(coplane, co_val);
cg_val = _mm_packs_epi16(cg_val, cg_val);
_mm_storeu_si128((__m128i*)cgplane, cg_val);
STORE_SI128(cgplane, cg_val);
a_val = _mm_packus_epi16(a_val, a_val);
_mm_storeu_si128((__m128i*)aplane, a_val);
STORE_SI128(aplane, a_val);
yplane += 8;
coplane += 8;
cgplane += 8;
@@ -354,21 +355,21 @@ static void nsc_encode_subsampling_sse2(NSC_CONTEXT* context)
for (UINT32 x = 0; x < tempWidth >> 1; x += 8)
{
t = _mm_loadu_si128((__m128i*)co_src0);
t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*)co_src1));
t = LOAD_SI128(co_src0);
t = _mm_avg_epu8(t, LOAD_SI128(co_src1));
val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
val = _mm_packus_epi16(val, val);
_mm_storeu_si128((__m128i*)co_dst, val);
STORE_SI128(co_dst, val);
co_dst += 8;
co_src0 += 16;
co_src1 += 16;
t = _mm_loadu_si128((__m128i*)cg_src0);
t = _mm_avg_epu8(t, _mm_loadu_si128((__m128i*)cg_src1));
t = LOAD_SI128(cg_src0);
t = _mm_avg_epu8(t, LOAD_SI128(cg_src1));
val = _mm_and_si128(_mm_srli_si128(t, 1), mask);
val = _mm_avg_epu16(val, _mm_and_si128(t, mask));
val = _mm_packus_epi16(val, val);
_mm_storeu_si128((__m128i*)cg_dst, val);
STORE_SI128(cg_dst, val);
cg_dst += 8;
cg_src0 += 16;
cg_src1 += 16;
@@ -391,7 +392,8 @@ static BOOL nsc_encode_sse2(NSC_CONTEXT* context, const BYTE* data, UINT32 scanl
void nsc_init_sse2(NSC_CONTEXT* context)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;
PROFILER_RENAME(context->priv->prof_nsc_encode, "nsc_encode_sse2")

View File

@@ -27,6 +27,7 @@
#include "rfx_sse2.h"
#include "../../core/simd.h"
#include "../../primitives/sse/prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <stdio.h>
@@ -75,10 +76,10 @@ rfx_quantization_decode_block_sse2(INT16* WINPR_RESTRICT buffer, const size_t bu
do
{
const __m128i la = _mm_load_si128(ptr);
const __m128i la = LOAD_SI128(ptr);
const __m128i a = _mm_slli_epi16(la, WINPR_ASSERTING_INT_CAST(int, factor));
_mm_store_si128(ptr, a);
STORE_SI128(ptr, a);
ptr++;
} while (ptr < buf_end);
}
@@ -116,10 +117,10 @@ rfx_quantization_encode_block_sse2(INT16* WINPR_RESTRICT buffer, const unsigned
do
{
const __m128i la = _mm_load_si128(ptr);
const __m128i la = LOAD_SI128(ptr);
__m128i a = _mm_add_epi16(la, half);
a = _mm_srai_epi16(a, factor);
_mm_store_si128(ptr, a);
STORE_SI128(ptr, a);
ptr++;
} while (ptr < buf_end);
}
@@ -177,9 +178,9 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
for (size_t n = 0; n < subband_width; n += 8)
{
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
__m128i l_n = _mm_load_si128((__m128i*)l_ptr);
__m128i h_n = _mm_load_si128((__m128i*)h_ptr);
__m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - 1));
__m128i l_n = LOAD_SI128(l_ptr);
__m128i h_n = LOAD_SI128(h_ptr);
__m128i h_n_m = LOAD_SI128(h_ptr - 1);
if (n == 0)
{
@@ -191,7 +192,7 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
tmp_n = _mm_srai_epi16(tmp_n, 1);
const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*)l_ptr, dst_n);
STORE_SI128(l_ptr, dst_n);
l_ptr += 8;
h_ptr += 8;
}
@@ -203,10 +204,10 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
for (size_t n = 0; n < subband_width; n += 8)
{
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
__m128i h_n = _mm_load_si128((__m128i*)h_ptr);
__m128i h_n = LOAD_SI128(h_ptr);
h_n = _mm_slli_epi16(h_n, 1);
__m128i dst_n = _mm_load_si128((__m128i*)(l_ptr));
__m128i dst_n_p = _mm_loadu_si128((__m128i*)(l_ptr + 1));
__m128i dst_n = LOAD_SI128(l_ptr);
__m128i dst_n_p = LOAD_SI128(l_ptr + 1);
if (n == subband_width - 8)
{
@@ -219,8 +220,8 @@ rfx_dwt_2d_decode_block_horiz_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRIC
tmp_n = _mm_add_epi16(tmp_n, h_n);
dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
_mm_store_si128((__m128i*)dst_ptr, dst1);
_mm_store_si128((__m128i*)(dst_ptr + 8), dst2);
STORE_SI128(dst_ptr, dst1);
STORE_SI128(dst_ptr + 8, dst2);
l_ptr += 8;
h_ptr += 8;
dst_ptr += 16;
@@ -243,21 +244,21 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT
for (size_t x = 0; x < total_width; x += 8)
{
/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
const __m128i l_n = _mm_load_si128((__m128i*)l_ptr);
const __m128i h_n = _mm_load_si128((__m128i*)h_ptr);
const __m128i l_n = LOAD_SI128(l_ptr);
const __m128i h_n = LOAD_SI128(h_ptr);
__m128i tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
if (n == 0)
tmp_n = _mm_add_epi16(tmp_n, h_n);
else
{
const __m128i h_n_m = _mm_loadu_si128((__m128i*)(h_ptr - total_width));
const __m128i h_n_m = LOAD_SI128(h_ptr - total_width);
tmp_n = _mm_add_epi16(tmp_n, h_n_m);
}
tmp_n = _mm_srai_epi16(tmp_n, 1);
const __m128i dst_n = _mm_sub_epi16(l_n, tmp_n);
_mm_store_si128((__m128i*)dst_ptr, dst_n);
STORE_SI128(dst_ptr, dst_n);
l_ptr += 8;
h_ptr += 8;
dst_ptr += 8;
@@ -275,8 +276,8 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT
for (size_t x = 0; x < total_width; x += 8)
{
/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
__m128i h_n = _mm_load_si128((__m128i*)h_ptr);
__m128i dst_n_m = _mm_load_si128((__m128i*)(dst_ptr - total_width));
__m128i h_n = LOAD_SI128(h_ptr);
__m128i dst_n_m = LOAD_SI128(dst_ptr - total_width);
h_n = _mm_slli_epi16(h_n, 1);
__m128i tmp_n = dst_n_m;
@@ -284,13 +285,13 @@ rfx_dwt_2d_decode_block_vert_sse2(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT
tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
else
{
const __m128i dst_n_p = _mm_loadu_si128((__m128i*)(dst_ptr + total_width));
const __m128i dst_n_p = LOAD_SI128(dst_ptr + total_width);
tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
}
tmp_n = _mm_srai_epi16(tmp_n, 1);
const __m128i dst_n = _mm_add_epi16(tmp_n, h_n);
_mm_store_si128((__m128i*)dst_ptr, dst_n);
STORE_SI128(dst_ptr, dst_n);
h_ptr += 8;
dst_ptr += 8;
}
@@ -342,29 +343,29 @@ rfx_dwt_2d_encode_block_vert_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTRI
{
for (size_t x = 0; x < total_width; x += 8)
{
__m128i src_2n = _mm_load_si128((__m128i*)src);
__m128i src_2n_1 = _mm_load_si128((__m128i*)(src + total_width));
__m128i src_2n = LOAD_SI128(src);
__m128i src_2n_1 = LOAD_SI128(src + total_width);
__m128i src_2n_2 = src_2n;
if (n < subband_width - 1)
src_2n_2 = _mm_load_si128((__m128i*)(src + 2ULL * total_width));
src_2n_2 = LOAD_SI128(src + 2ULL * total_width);
/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */
__m128i h_n = _mm_add_epi16(src_2n, src_2n_2);
h_n = _mm_srai_epi16(h_n, 1);
h_n = _mm_sub_epi16(src_2n_1, h_n);
h_n = _mm_srai_epi16(h_n, 1);
_mm_store_si128((__m128i*)h, h_n);
STORE_SI128(h, h_n);
__m128i h_n_m = h_n;
if (n != 0)
h_n_m = _mm_load_si128((__m128i*)(h - total_width));
h_n_m = LOAD_SI128(h - total_width);
/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */
__m128i l_n = _mm_add_epi16(h_n_m, h_n);
l_n = _mm_srai_epi16(l_n, 1);
l_n = _mm_add_epi16(l_n, src_2n);
_mm_store_si128((__m128i*)l, l_n);
STORE_SI128(l, l_n);
src += 8;
l += 8;
h += 8;
@@ -396,8 +397,8 @@ rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTR
h_n = _mm_srai_epi16(h_n, 1);
h_n = _mm_sub_epi16(src_2n_1, h_n);
h_n = _mm_srai_epi16(h_n, 1);
_mm_store_si128((__m128i*)h, h_n);
__m128i h_n_m = _mm_loadu_si128((__m128i*)(h - 1));
STORE_SI128(h, h_n);
__m128i h_n_m = LOAD_SI128(h - 1);
if (n == 0)
{
@@ -409,7 +410,7 @@ rfx_dwt_2d_encode_block_horiz_sse2(INT16* WINPR_RESTRICT src, INT16* WINPR_RESTR
__m128i l_n = _mm_add_epi16(h_n_m, h_n);
l_n = _mm_srai_epi16(l_n, 1);
l_n = _mm_add_epi16(l_n, src_2n);
_mm_store_si128((__m128i*)l, l_n);
STORE_SI128(l, l_n);
src += 16;
l += 8;
h += 8;
@@ -453,7 +454,8 @@ static void rfx_dwt_2d_encode_sse2(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RE
void rfx_init_sse2(RFX_CONTEXT* context)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
if (!IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE))
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;
PROFILER_RENAME(context->priv->prof_rfx_quantization_decode, "rfx_quantization_decode_sse2")

View File

@@ -25,9 +25,16 @@ set(PRIMITIVES_SRCS
prim_internal.h
)
set(PRIMITIVES_SSE2_SRCS sse/prim_colors_sse2.c sse/prim_set_sse2.c sse/prim_avxsse.h sse/prim_templates.h)
set(PRIMITIVES_SSE3_SRCS sse/prim_add_sse3.c sse/prim_alphaComp_sse3.c sse/prim_andor_sse3.c sse/prim_shift_sse3.c)
set(PRIMITIVES_SSE3_SRCS
sse/prim_avxsse.h
sse/prim_templates.h
sse/prim_colors_sse2.c
sse/prim_set_sse2.c
sse/prim_add_sse3.c
sse/prim_alphaComp_sse3.c
sse/prim_andor_sse3.c
sse/prim_shift_sse3.c
)
set(PRIMITIVES_SSSE3_SRCS sse/prim_sign_ssse3.c sse/prim_YCoCg_ssse3.c)
@@ -58,14 +65,8 @@ if(WITH_OPENCL)
freerdp_library_add(OpenCL::OpenCL)
endif()
set(PRIMITIVES_OPT_SRCS
${PRIMITIVES_NEON_SRCS}
${PRIMITIVES_SSE2_SRCS}
${PRIMITIVES_SSE3_SRCS}
${PRIMITIVES_SSSE3_SRCS}
${PRIMITIVES_SSE4_1_SRCS}
${PRIMITIVES_SSE4_2_SRCS}
${PRIMITIVES_OPENCL_SRCS}
set(PRIMITIVES_OPT_SRCS ${PRIMITIVES_NEON_SRCS} ${PRIMITIVES_SSE3_SRCS} ${PRIMITIVES_SSSE3_SRCS}
${PRIMITIVES_SSE4_1_SRCS} ${PRIMITIVES_SSE4_2_SRCS} ${PRIMITIVES_OPENCL_SRCS}
)
if(WITH_AVX2)
@@ -80,7 +81,6 @@ add_library(freerdp-primitives OBJECT ${PRIMITIVES_SRCS})
include(CompilerDetect)
include(DetectIntrinsicSupport)
if(WITH_SIMD)
set_simd_source_file_properties("sse2" ${PRIMITIVES_SSE2_SRCS})
set_simd_source_file_properties("sse3" ${PRIMITIVES_SSE3_SRCS})
set_simd_source_file_properties("ssse3" ${PRIMITIVES_SSSE3_SRCS})
set_simd_source_file_properties("sse4.1" ${PRIMITIVES_SSE4_1_SRCS})

View File

@@ -73,7 +73,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
for (UINT32 h = 0; h < height; h++)
{
UINT32 w = width;
BOOL onStride = 0;
/* Get to a 16-byte destination boundary. */
if ((ULONG_PTR)dptr & 0x0f)
@@ -96,9 +95,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
w -= startup;
}
/* Each loop handles eight pixels at a time. */
onStride = (((ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
while (w >= 8)
{
__m128i R0;
@@ -110,22 +106,10 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
__m128i R6;
__m128i R7;
if (onStride)
{
/* The faster path, 16-byte aligned load. */
R0 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
else
{
/* Off-stride, slower LDDQU load. */
R0 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
R0 = LOAD_SI128(sptr);
sptr += (128 / 8);
R1 = LOAD_SI128(sptr);
sptr += (128 / 8);
/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
@@ -197,9 +181,9 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(const BYTE* WINPR_RESTRICT pSr
/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
R5 = _mm_unpackhi_epi16(R2, R3);
/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
_mm_store_si128((__m128i*)dptr, R4);
STORE_SI128(dptr, R4);
dptr += (128 / 8);
_mm_store_si128((__m128i*)dptr, R5);
STORE_SI128(dptr, R5);
dptr += (128 / 8);
w -= 8;
}
@@ -262,7 +246,6 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
for (UINT32 h = 0; h < height; h++)
{
UINT32 w = width;
BOOL onStride = 0;
/* Get to a 16-byte destination boundary. */
if ((ULONG_PTR)dptr & 0x0f)
@@ -285,47 +268,26 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
w -= startup;
}
/* Each loop handles eight pixels at a time. */
onStride = (((const ULONG_PTR)sptr & 0x0f) == 0) ? TRUE : FALSE;
while (w >= 8)
{
__m128i R0;
__m128i R1;
__m128i R2;
__m128i R3;
__m128i R4;
__m128i R5;
__m128i R6;
__m128i R7;
if (onStride)
{
/* The faster path, 16-byte aligned load. */
R0 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_load_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
else
{
/* Off-stride, slower LDDQU load. */
R0 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
R1 = _mm_lddqu_si128((const __m128i*)sptr);
sptr += (128 / 8);
}
/* The faster path, 16-byte aligned load. */
__m128i R0 = LOAD_SI128(sptr);
sptr += (128 / 8);
__m128i R1 = LOAD_SI128(sptr);
sptr += (128 / 8);
/* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
/* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
/* Shuffle to pack all the like types together. */
R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
R3 = _mm_shuffle_epi8(R0, R2);
R4 = _mm_shuffle_epi8(R1, R2);
__m128i R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
__m128i R3 = _mm_shuffle_epi8(R0, R2);
__m128i R4 = _mm_shuffle_epi8(R1, R2);
/* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
/* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
R5 = _mm_unpackhi_epi32(R3, R4);
R6 = _mm_unpacklo_epi32(R3, R4);
__m128i R5 = _mm_unpackhi_epi32(R3, R4);
__m128i R6 = _mm_unpacklo_epi32(R3, R4);
/* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
/* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
@@ -390,9 +352,9 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(const BYTE* WINPR_RESTRICT
/* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
R5 = _mm_unpackhi_epi16(R2, R3);
/* R5 = A7R7G7B7 A6R6G6B6 A5R5G5B5 A4R4G4B4 */
_mm_store_si128((__m128i*)dptr, R4);
STORE_SI128(dptr, R4);
dptr += (128 / 8);
_mm_store_si128((__m128i*)dptr, R5);
STORE_SI128(dptr, R5);
dptr += (128 / 8);
w -= 8;
}
@@ -456,6 +418,7 @@ void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
primitives_init_YCoCg(prims);
if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");

View File

@@ -28,6 +28,7 @@
#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_YUV.h"
@@ -56,7 +57,7 @@ static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yr
mm_set_epu32(0x80800380, 0x80800280, 0x80800180, 0x80800080),
mm_set_epu32(0x80808003, 0x80808002, 0x80808001, 0x80808000) };
const __m128i c128 = _mm_set1_epi16(128);
__m128i BGRX = _mm_and_si128(_mm_loadu_si128(dst),
__m128i BGRX = _mm_and_si128(LOAD_SI128(dst),
mm_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000));
{
__m128i C;
@@ -117,7 +118,7 @@ static inline __m128i* sse41_YUV444Pixel(__m128i* WINPR_RESTRICT dst, __m128i Yr
BGRX = _mm_or_si128(BGRX, packed);
}
}
_mm_storeu_si128(dst++, BGRX);
STORE_SI128(dst++, BGRX);
return dst;
}
@@ -140,9 +141,9 @@ static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[]
for (UINT32 x = 0; x < nWidth - pad; x += 16)
{
const __m128i Y = _mm_loadu_si128((const __m128i*)YData);
const __m128i uRaw = _mm_loadu_si128((const __m128i*)UData);
const __m128i vRaw = _mm_loadu_si128((const __m128i*)VData);
const __m128i Y = LOAD_SI128(YData);
const __m128i uRaw = LOAD_SI128(UData);
const __m128i vRaw = LOAD_SI128(VData);
const __m128i U = _mm_shuffle_epi8(uRaw, duplicate);
const __m128i V = _mm_shuffle_epi8(vRaw, duplicate);
YData += 16;
@@ -445,12 +446,9 @@ static inline pstatus_t sse41_YUV444ToRGB_8u_P3AC4R_BGRX_DOUBLE_ROW(
size_t x = 0;
for (; x < nWidth - pad; x += 16)
{
const __m128i Y[] = { _mm_loadu_si128((const __m128i*)&YData[0][x]),
_mm_loadu_si128((const __m128i*)&YData[1][x]) };
__m128i U[] = { _mm_loadu_si128((const __m128i*)&UData[0][x]),
_mm_loadu_si128((const __m128i*)&UData[1][x]) };
__m128i V[] = { _mm_loadu_si128((const __m128i*)&VData[0][x]),
_mm_loadu_si128((const __m128i*)&VData[1][x]) };
const __m128i Y[] = { LOAD_SI128(&YData[0][x]), LOAD_SI128(&YData[1][x]) };
__m128i U[] = { LOAD_SI128(&UData[0][x]), LOAD_SI128(&UData[1][x]) };
__m128i V[] = { LOAD_SI128(&VData[0][x]), LOAD_SI128(&VData[1][x]) };
BYTE* dstp[] = { &pDst[0][x * 4], &pDst[1][x * 4] };
sse41_BGRX_fillRGB(dstp, Y, U, V);
@@ -636,21 +634,21 @@ static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
__m128i x0 = _mm_loadu_si128(argb++); // 1st 4 pixels
__m128i x0 = LOAD_SI128(argb++); // 1st 4 pixels
{
x0 = _mm_maddubs_epi16(x0, y_factors);
__m128i x1 = _mm_loadu_si128(argb++); // 2nd 4 pixels
__m128i x1 = LOAD_SI128(argb++); // 2nd 4 pixels
x1 = _mm_maddubs_epi16(x1, y_factors);
x0 = _mm_hadds_epi16(x0, x1);
x0 = _mm_srli_epi16(x0, Y_SHIFT);
}
__m128i x2 = _mm_loadu_si128(argb++); // 3rd 4 pixels
__m128i x2 = LOAD_SI128(argb++); // 3rd 4 pixels
{
x2 = _mm_maddubs_epi16(x2, y_factors);
__m128i x3 = _mm_loadu_si128(argb++); // 4th 4 pixels
__m128i x3 = LOAD_SI128(argb++); // 4th 4 pixels
x3 = _mm_maddubs_epi16(x3, y_factors);
x2 = _mm_hadds_epi16(x2, x3);
x2 = _mm_srli_epi16(x2, Y_SHIFT);
@@ -658,7 +656,7 @@ static INLINE void sse41_RGBToYUV420_BGRX_Y(const BYTE* WINPR_RESTRICT src, BYTE
x0 = _mm_packus_epi16(x0, x2);
/* save to y plane */
_mm_storeu_si128(ydst++, x0);
STORE_SI128(ydst++, x0);
}
for (; x < width; x++)
@@ -688,20 +686,20 @@ static INLINE void sse41_RGBToYUV420_BGRX_UV(const BYTE* WINPR_RESTRICT src1,
__m64* vdst = (__m64*)&dst2[x / 2];
/* subsample 16x2 pixels into 16x1 pixels */
__m128i x0 = _mm_loadu_si128(&rgb1[0]);
__m128i x4 = _mm_loadu_si128(&rgb2[0]);
__m128i x0 = LOAD_SI128(&rgb1[0]);
__m128i x4 = LOAD_SI128(&rgb2[0]);
x0 = _mm_avg_epu8(x0, x4);
__m128i x1 = _mm_loadu_si128(&rgb1[1]);
x4 = _mm_loadu_si128(&rgb2[1]);
__m128i x1 = LOAD_SI128(&rgb1[1]);
x4 = LOAD_SI128(&rgb2[1]);
x1 = _mm_avg_epu8(x1, x4);
__m128i x2 = _mm_loadu_si128(&rgb1[2]);
x4 = _mm_loadu_si128(&rgb2[2]);
__m128i x2 = LOAD_SI128(&rgb1[2]);
x4 = LOAD_SI128(&rgb2[2]);
x2 = _mm_avg_epu8(x2, x4);
__m128i x3 = _mm_loadu_si128(&rgb1[3]);
x4 = _mm_loadu_si128(&rgb2[3]);
__m128i x3 = LOAD_SI128(&rgb1[3]);
x4 = LOAD_SI128(&rgb2[3]);
x3 = _mm_avg_epu8(x3, x4);
/* subsample these 16x1 pixels into 8x1 pixels */
@@ -827,14 +825,14 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
for (; x < width - width % 16; x += 16)
{
/* store 16 rgba pixels in 4 128 bit registers */
const __m128i xe1 = _mm_loadu_si128(argbEven++); // 1st 4 pixels
const __m128i xe2 = _mm_loadu_si128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = _mm_loadu_si128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = _mm_loadu_si128(argbEven++); // 4th 4 pixels
const __m128i xo1 = _mm_loadu_si128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = _mm_loadu_si128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = _mm_loadu_si128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = _mm_loadu_si128(argbOdd++); // 4th 4 pixels
const __m128i xe1 = LOAD_SI128(argbEven++); // 1st 4 pixels
const __m128i xe2 = LOAD_SI128(argbEven++); // 2nd 4 pixels
const __m128i xe3 = LOAD_SI128(argbEven++); // 3rd 4 pixels
const __m128i xe4 = LOAD_SI128(argbEven++); // 4th 4 pixels
const __m128i xo1 = LOAD_SI128(argbOdd++); // 1st 4 pixels
const __m128i xo2 = LOAD_SI128(argbOdd++); // 2nd 4 pixels
const __m128i xo3 = LOAD_SI128(argbOdd++); // 3rd 4 pixels
const __m128i xo4 = LOAD_SI128(argbOdd++); // 4th 4 pixels
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
@@ -852,12 +850,12 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
Y_SHIFT);
const __m128i yo = _mm_packus_epi16(yo1, yo2);
/* store y [b1] */
_mm_storeu_si128((__m128i*)b1Even, ye);
STORE_SI128(b1Even, ye);
b1Even += 16;
if (b1Odd)
{
_mm_storeu_si128((__m128i*)b1Odd, yo);
STORE_SI128(b1Odd, yo);
b1Odd += 16;
}
}
@@ -925,7 +923,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
if (b1Odd) /* b4 */
{
_mm_storeu_si128((__m128i*)b4, uo);
STORE_SI128(b4, uo);
b4 += 16;
}
@@ -1003,7 +1001,7 @@ static INLINE void sse41_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
if (b1Odd) /* b5 */
{
_mm_storeu_si128((__m128i*)b5, vo);
STORE_SI128(b5, vo);
b5 += 16;
}
@@ -1117,14 +1115,14 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
/* store 16 rgba pixels in 4 128 bit registers
* for even and odd rows.
*/
const __m128i xe1 = _mm_loadu_si128(argbEven++); /* 1st 4 pixels */
const __m128i xe2 = _mm_loadu_si128(argbEven++); /* 2nd 4 pixels */
const __m128i xe3 = _mm_loadu_si128(argbEven++); /* 3rd 4 pixels */
const __m128i xe4 = _mm_loadu_si128(argbEven++); /* 4th 4 pixels */
const __m128i xo1 = _mm_loadu_si128(argbOdd++); /* 1st 4 pixels */
const __m128i xo2 = _mm_loadu_si128(argbOdd++); /* 2nd 4 pixels */
const __m128i xo3 = _mm_loadu_si128(argbOdd++); /* 3rd 4 pixels */
const __m128i xo4 = _mm_loadu_si128(argbOdd++); /* 4th 4 pixels */
const __m128i xe1 = LOAD_SI128(argbEven++); /* 1st 4 pixels */
const __m128i xe2 = LOAD_SI128(argbEven++); /* 2nd 4 pixels */
const __m128i xe3 = LOAD_SI128(argbEven++); /* 3rd 4 pixels */
const __m128i xe4 = LOAD_SI128(argbEven++); /* 4th 4 pixels */
const __m128i xo1 = LOAD_SI128(argbOdd++); /* 1st 4 pixels */
const __m128i xo2 = LOAD_SI128(argbOdd++); /* 2nd 4 pixels */
const __m128i xo3 = LOAD_SI128(argbOdd++); /* 3rd 4 pixels */
const __m128i xo4 = LOAD_SI128(argbOdd++); /* 4th 4 pixels */
{
/* Y: multiplications with subtotals and horizontal sums */
const __m128i y_factors = BGRX_Y_FACTORS;
@@ -1136,7 +1134,7 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
Y_SHIFT);
const __m128i ye = _mm_packus_epi16(ye1, ye2);
/* store y [b1] */
_mm_storeu_si128((__m128i*)yLumaDstEven, ye);
STORE_SI128(yLumaDstEven, ye);
yLumaDstEven += 16;
}
@@ -1150,7 +1148,7 @@ static INLINE void sse41_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
_mm_maddubs_epi16(xo4, y_factors)),
Y_SHIFT);
const __m128i yo = _mm_packus_epi16(yo1, yo2);
_mm_storeu_si128((__m128i*)yLumaDstOdd, yo);
STORE_SI128(yLumaDstOdd, yo);
yLumaDstOdd += 16;
}
@@ -1470,22 +1468,22 @@ static pstatus_t sse41_LumaToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[], const
const __m128i unpackLow =
_mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8);
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Um[x]);
const __m128i u = LOAD_SI128(&Um[x]);
const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
_mm_storeu_si128((__m128i*)&pU[2ULL * x], uHigh);
_mm_storeu_si128((__m128i*)&pU[2ULL * x + 16], uLow);
_mm_storeu_si128((__m128i*)&pU1[2ULL * x], uHigh);
_mm_storeu_si128((__m128i*)&pU1[2ULL * x + 16], uLow);
STORE_SI128(&pU[2ULL * x], uHigh);
STORE_SI128(&pU[2ULL * x + 16], uLow);
STORE_SI128(&pU1[2ULL * x], uHigh);
STORE_SI128(&pU1[2ULL * x + 16], uLow);
}
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Vm[x]);
const __m128i u = LOAD_SI128(&Vm[x]);
const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh);
const __m128i uLow = _mm_shuffle_epi8(u, unpackLow);
_mm_storeu_si128((__m128i*)&pV[2 * x], uHigh);
_mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow);
_mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh);
_mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow);
STORE_SI128(&pV[2 * x], uHigh);
STORE_SI128(&pV[2 * x + 16], uLow);
STORE_SI128(&pV1[2 * x], uHigh);
STORE_SI128(&pV1[2 * x + 16], uLow);
}
}
@@ -1578,14 +1576,14 @@ static pstatus_t sse41_ChromaV1ToYUV444(const BYTE* WINPR_RESTRICT pSrcRaw[3],
for (; x < halfWidth - halfPad; x += 16)
{
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Ua[x]);
const __m128i u = LOAD_SI128(&Ua[x]);
const __m128i u2 = _mm_unpackhi_epi8(u, zero);
const __m128i u1 = _mm_unpacklo_epi8(u, zero);
_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
}
{
const __m128i u = _mm_loadu_si128((const __m128i*)&Va[x]);
const __m128i u = LOAD_SI128(&Va[x]);
const __m128i u2 = _mm_unpackhi_epi8(u, zero);
const __m128i u1 = _mm_unpacklo_epi8(u, zero);
_mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]);
@@ -1641,14 +1639,14 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons
for (; x < halfWidth - halfPad; x += 16)
{
{
const __m128i u = _mm_loadu_si128((const __m128i*)&pYaU[x]);
const __m128i u = LOAD_SI128(&pYaU[x]);
const __m128i u2 = _mm_unpackhi_epi8(zero, u);
const __m128i u1 = _mm_unpacklo_epi8(zero, u);
_mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]);
_mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]);
}
{
const __m128i v = _mm_loadu_si128((const __m128i*)&pYaV[x]);
const __m128i v = LOAD_SI128(&pYaV[x]);
const __m128i v2 = _mm_unpackhi_epi8(zero, v);
const __m128i v1 = _mm_unpacklo_epi8(zero, v);
_mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]);
@@ -1678,8 +1676,8 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons
for (; x < quaterWidth - quaterPad; x += 16)
{
{
const __m128i uU = _mm_loadu_si128((const __m128i*)&pUaU[x]);
const __m128i uV = _mm_loadu_si128((const __m128i*)&pVaU[x]);
const __m128i uU = LOAD_SI128(&pUaU[x]);
const __m128i uV = LOAD_SI128(&pVaU[x]);
const __m128i uHigh = _mm_unpackhi_epi8(uU, uV);
const __m128i uLow = _mm_unpacklo_epi8(uU, uV);
const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2);
@@ -1692,8 +1690,8 @@ static pstatus_t sse41_ChromaV2ToYUV444(const BYTE* WINPR_RESTRICT pSrc[3], cons
_mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]);
}
{
const __m128i vU = _mm_loadu_si128((const __m128i*)&pUaV[x]);
const __m128i vV = _mm_loadu_si128((const __m128i*)&pVaV[x]);
const __m128i vU = LOAD_SI128(&pUaV[x]);
const __m128i vV = LOAD_SI128(&pVaV[x]);
const __m128i vHigh = _mm_unpackhi_epi8(vU, vV);
const __m128i vLow = _mm_unpacklo_epi8(vU, vV);
const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2);

View File

@@ -75,29 +75,29 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
__m128i* vdptr1 = (__m128i*)dptr1;
__m128i* vdptr2 = (__m128i*)dptr2;
__m128i xmm0 = _mm_lddqu_si128(vsptr1++);
__m128i xmm1 = _mm_lddqu_si128(vsptr1++);
__m128i xmm2 = _mm_lddqu_si128(vsptr1++);
__m128i xmm3 = _mm_lddqu_si128(vsptr1++);
__m128i xmm4 = _mm_lddqu_si128(vsptr2++);
__m128i xmm5 = _mm_lddqu_si128(vsptr2++);
__m128i xmm6 = _mm_lddqu_si128(vsptr2++);
__m128i xmm7 = _mm_lddqu_si128(vsptr2++);
__m128i xmm0 = LOAD_SI128(vsptr1++);
__m128i xmm1 = LOAD_SI128(vsptr1++);
__m128i xmm2 = LOAD_SI128(vsptr1++);
__m128i xmm3 = LOAD_SI128(vsptr1++);
__m128i xmm4 = LOAD_SI128(vsptr2++);
__m128i xmm5 = LOAD_SI128(vsptr2++);
__m128i xmm6 = LOAD_SI128(vsptr2++);
__m128i xmm7 = LOAD_SI128(vsptr2++);
xmm0 = _mm_adds_epi16(xmm0, xmm4);
xmm1 = _mm_adds_epi16(xmm1, xmm5);
xmm2 = _mm_adds_epi16(xmm2, xmm6);
xmm3 = _mm_adds_epi16(xmm3, xmm7);
_mm_store_si128(vdptr1++, xmm0);
_mm_store_si128(vdptr1++, xmm1);
_mm_store_si128(vdptr1++, xmm2);
_mm_store_si128(vdptr1++, xmm3);
STORE_SI128(vdptr1++, xmm0);
STORE_SI128(vdptr1++, xmm1);
STORE_SI128(vdptr1++, xmm2);
STORE_SI128(vdptr1++, xmm3);
_mm_store_si128(vdptr2++, xmm0);
_mm_store_si128(vdptr2++, xmm1);
_mm_store_si128(vdptr2++, xmm2);
_mm_store_si128(vdptr2++, xmm3);
STORE_SI128(vdptr2++, xmm0);
STORE_SI128(vdptr2++, xmm1);
STORE_SI128(vdptr2++, xmm2);
STORE_SI128(vdptr2++, xmm3);
dptr1 = (INT16*)vdptr1;
dptr2 = (INT16*)vdptr2;
@@ -113,29 +113,29 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
__m128i* vdptr1 = (__m128i*)dptr1;
__m128i* vdptr2 = (__m128i*)dptr2;
__m128i xmm0 = _mm_load_si128(vsptr1++);
__m128i xmm1 = _mm_load_si128(vsptr1++);
__m128i xmm2 = _mm_load_si128(vsptr1++);
__m128i xmm3 = _mm_load_si128(vsptr1++);
__m128i xmm4 = _mm_load_si128(vsptr2++);
__m128i xmm5 = _mm_load_si128(vsptr2++);
__m128i xmm6 = _mm_load_si128(vsptr2++);
__m128i xmm7 = _mm_load_si128(vsptr2++);
__m128i xmm0 = LOAD_SI128(vsptr1++);
__m128i xmm1 = LOAD_SI128(vsptr1++);
__m128i xmm2 = LOAD_SI128(vsptr1++);
__m128i xmm3 = LOAD_SI128(vsptr1++);
__m128i xmm4 = LOAD_SI128(vsptr2++);
__m128i xmm5 = LOAD_SI128(vsptr2++);
__m128i xmm6 = LOAD_SI128(vsptr2++);
__m128i xmm7 = LOAD_SI128(vsptr2++);
xmm0 = _mm_adds_epi16(xmm0, xmm4);
xmm1 = _mm_adds_epi16(xmm1, xmm5);
xmm2 = _mm_adds_epi16(xmm2, xmm6);
xmm3 = _mm_adds_epi16(xmm3, xmm7);
_mm_store_si128(vdptr1++, xmm0);
_mm_store_si128(vdptr1++, xmm1);
_mm_store_si128(vdptr1++, xmm2);
_mm_store_si128(vdptr1++, xmm3);
STORE_SI128(vdptr1++, xmm0);
STORE_SI128(vdptr1++, xmm1);
STORE_SI128(vdptr1++, xmm2);
STORE_SI128(vdptr1++, xmm3);
_mm_store_si128(vdptr2++, xmm0);
_mm_store_si128(vdptr2++, xmm1);
_mm_store_si128(vdptr2++, xmm2);
_mm_store_si128(vdptr2++, xmm3);
STORE_SI128(vdptr2++, xmm0);
STORE_SI128(vdptr2++, xmm1);
STORE_SI128(vdptr2++, xmm2);
STORE_SI128(vdptr2++, xmm3);
dptr1 = (INT16*)vdptr1;
dptr2 = (INT16*)vdptr2;
@@ -156,8 +156,8 @@ static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
xmm0 = _mm_adds_epi16(xmm0, xmm1);
_mm_store_si128(vdptr1++, xmm0);
_mm_store_si128(vdptr2++, xmm0);
STORE_SI128(vdptr1++, xmm0);
STORE_SI128(vdptr2++, xmm0);
dptr1 = (INT16*)vdptr1;
dptr2 = (INT16*)vdptr2;

View File

@@ -28,6 +28,7 @@
#include "prim_alphaComp.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
/* ------------------------------------------------------------------------- */
@@ -171,7 +172,7 @@ static pstatus_t sse2_alphaComp_argb(const BYTE* WINPR_RESTRICT pSrc1, UINT32 sr
xmm5 = _mm_and_si128(xmm5, xmm3);
/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
xmm5 = _mm_packus_epi16(xmm5, xmm4);
_mm_store_si128((__m128i*)dptr, xmm5);
STORE_SI128(dptr, xmm5);
dptr += 4;
}

View File

@@ -19,13 +19,19 @@
*/
#pragma once
#include "prim_internal.h"
#include <winpr/cast.h>
#include "../../core/simd.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>
static inline __m128i mm_set_epu32(uint32_t val1, uint32_t val2, uint32_t val3, uint32_t val4)
{
return _mm_set_epi32((int32_t)val1, (int32_t)val2, (int32_t)val3, (int32_t)val4);
return _mm_set_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val1), WINPR_CXX_COMPAT_CAST(int32_t, val2),
WINPR_CXX_COMPAT_CAST(int32_t, val3),
WINPR_CXX_COMPAT_CAST(int32_t, val4));
}
static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint8_t val4,
@@ -33,31 +39,36 @@ static inline __m128i mm_set_epu8(uint8_t val1, uint8_t val2, uint8_t val3, uint
uint8_t val9, uint8_t val10, uint8_t val11, uint8_t val12,
uint8_t val13, uint8_t val14, uint8_t val15, uint8_t val16)
{
return _mm_set_epi8((int8_t)val1, (int8_t)val2, (int8_t)val3, (int8_t)val4, (int8_t)val5,
(int8_t)val6, (int8_t)val7, (int8_t)val8, (int8_t)val9, (int8_t)val10,
(int8_t)val11, (int8_t)val12, (int8_t)val13, (int8_t)val14, (int8_t)val15,
(int8_t)val16);
return _mm_set_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val1), WINPR_CXX_COMPAT_CAST(int8_t, val2),
WINPR_CXX_COMPAT_CAST(int8_t, val3), WINPR_CXX_COMPAT_CAST(int8_t, val4),
WINPR_CXX_COMPAT_CAST(int8_t, val5), WINPR_CXX_COMPAT_CAST(int8_t, val6),
WINPR_CXX_COMPAT_CAST(int8_t, val7), WINPR_CXX_COMPAT_CAST(int8_t, val8),
WINPR_CXX_COMPAT_CAST(int8_t, val9), WINPR_CXX_COMPAT_CAST(int8_t, val10),
WINPR_CXX_COMPAT_CAST(int8_t, val11), WINPR_CXX_COMPAT_CAST(int8_t, val12),
WINPR_CXX_COMPAT_CAST(int8_t, val13), WINPR_CXX_COMPAT_CAST(int8_t, val14),
WINPR_CXX_COMPAT_CAST(int8_t, val15), WINPR_CXX_COMPAT_CAST(int8_t, val16));
}
static inline __m128i mm_set1_epu32(uint32_t val)
{
return _mm_set1_epi32((int32_t)val);
return _mm_set1_epi32(WINPR_CXX_COMPAT_CAST(int32_t, val));
}
static inline __m128i mm_set1_epu8(uint8_t val)
{
return _mm_set1_epi8((int8_t)val);
return _mm_set1_epi8(WINPR_CXX_COMPAT_CAST(int8_t, val));
}
/* Use lddqu for unaligned; load for 16-byte aligned. */
static inline __m128i LOAD_SI128(const void* ptr)
{
const ULONG_PTR uptr = (const ULONG_PTR)ptr;
const __m128i* mptr = (const __m128i*)ptr;
if ((uptr & 0x0f) != 0)
return _mm_loadu_si128(mptr);
const __m128i* mptr = WINPR_CXX_COMPAT_CAST(const __m128i*, ptr);
return _mm_lddqu_si128(mptr);
}
return _mm_load_si128(mptr);
static inline void STORE_SI128(void* ptr, __m128i val)
{
__m128i* mptr = WINPR_CXX_COMPAT_CAST(__m128i*, ptr);
_mm_storeu_si128(mptr, val);
}
#endif

View File

@@ -181,32 +181,32 @@ sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y = _mm_load_si128(y_buf + i);
__m128i y = LOAD_SI128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
__m128i cb = _mm_load_si128(cb_buf + i);
__m128i cb = LOAD_SI128(cb_buf + i);
/* cr = cr_b_buf[i]; */
__m128i cr = _mm_load_si128(cr_buf + i);
__m128i cr = LOAD_SI128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = CLIP(r); */
mm_between_epi16(r, zero, max);
_mm_store_si128(r_buf + i, r);
STORE_SI128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
__m128i g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = CLIP(g); */
mm_between_epi16(g, zero, max);
_mm_store_si128(g_buf + i, g);
STORE_SI128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
__m128i b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b, zero, max);
_mm_store_si128(b_buf + i, b);
STORE_SI128(b_buf + i, b);
}
y_buf += srcbump;
@@ -291,15 +291,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y1 = _mm_load_si128((const __m128i*)y_buf);
__m128i y1 = LOAD_SI128(y_buf);
y_buf += step;
y1 = _mm_add_epi16(y1, c4096);
y1 = _mm_srai_epi16(y1, 2);
/* cb = cb_g_buf[i]; */
__m128i cb1 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb1 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr1 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr1 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
@@ -317,15 +317,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
b1 = _mm_srai_epi16(b1, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b1, zero, max);
__m128i y2 = _mm_load_si128((const __m128i*)y_buf);
__m128i y2 = LOAD_SI128(y_buf);
y_buf += step;
y2 = _mm_add_epi16(y2, c4096);
y2 = _mm_srai_epi16(y2, 2);
/* cb = cb_g_buf[i]; */
__m128i cb2 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb2 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr2 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr2 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
@@ -369,13 +369,13 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
R2 = R3; /* R2 = R3 */
R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = B5G5R5FFB4G4R4FF */
R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = B7G7R7FFB6G6R6FF */
_mm_store_si128((__m128i*)d_buf, R0); /* B1G1R1FFB0G0R0FF */
STORE_SI128(d_buf, R0); /* B1G1R1FFB0G0R0FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R4); /* B3G3R3FFB2G2R2FF */
STORE_SI128(d_buf, R4); /* B3G3R3FFB2G2R2FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R2); /* B5G5R5FFB4G4R4FF */
STORE_SI128(d_buf, R2); /* B5G5R5FFB4G4R4FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R3); /* B7G7R7FFB6G6R6FF */
STORE_SI128(d_buf, R3); /* B7G7R7FFB6G6R6FF */
d_buf += sizeof(__m128i);
}
}
@@ -476,15 +476,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y1 = _mm_load_si128((const __m128i*)y_buf);
__m128i y1 = LOAD_SI128(y_buf);
y_buf += step;
y1 = _mm_add_epi16(y1, c4096);
y1 = _mm_srai_epi16(y1, 2);
/* cb = cb_g_buf[i]; */
__m128i cb1 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb1 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr1 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr1 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r1 = _mm_add_epi16(y1, _mm_mulhi_epi16(cr1, r_cr));
@@ -502,15 +502,15 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
b1 = _mm_srai_epi16(b1, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b1, zero, max);
__m128i y2 = _mm_load_si128((const __m128i*)y_buf);
__m128i y2 = LOAD_SI128(y_buf);
y_buf += step;
y2 = _mm_add_epi16(y2, c4096);
y2 = _mm_srai_epi16(y2, 2);
/* cb = cb_g_buf[i]; */
__m128i cb2 = _mm_load_si128((const __m128i*)cb_buf);
__m128i cb2 = LOAD_SI128(cb_buf);
cb_buf += step;
/* cr = cr_b_buf[i]; */
__m128i cr2 = _mm_load_si128((const __m128i*)cr_buf);
__m128i cr2 = LOAD_SI128(cr_buf);
cr_buf += step;
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r2 = _mm_add_epi16(y2, _mm_mulhi_epi16(cr2, r_cr));
@@ -554,13 +554,13 @@ sse2_yCbCrToRGB_16s8u_P3AC4R_RGBX(const INT16* WINPR_RESTRICT pSrc[3], UINT32 sr
R2 = R3; /* R2 = R3 */
R2 = _mm_unpacklo_epi16(R1, R2); /* R2 = R5G5B5FFR4G4B4FF */
R3 = _mm_unpackhi_epi16(R1, R3); /* R3 = R7G7B7FFR6G6B6FF */
_mm_store_si128((__m128i*)d_buf, R0); /* R1G1B1FFR0G0B0FF */
STORE_SI128(d_buf, R0); /* R1G1B1FFR0G0B0FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R4); /* R3G3B3FFR2G2B2FF */
STORE_SI128(d_buf, R4); /* R3G3B3FFR2G2B2FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R2); /* R5G5B5FFR4G4B4FF */
STORE_SI128(d_buf, R2); /* R5G5B5FFR4G4B4FF */
d_buf += sizeof(__m128i);
_mm_store_si128((__m128i*)d_buf, R3); /* R7G7B7FFR6G6B6FF */
STORE_SI128(d_buf, R3); /* R7G7B7FFR6G6B6FF */
d_buf += sizeof(__m128i);
}
}
@@ -694,9 +694,9 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
* within the upper 16 bits we will also have to scale the RGB
* values used in the multiplication by << 5+(16-n).
*/
__m128i r = _mm_load_si128(r_buf + i);
__m128i g = _mm_load_si128(g_buf + i);
__m128i b = _mm_load_si128(b_buf + i);
__m128i r = LOAD_SI128(r_buf + i);
__m128i g = LOAD_SI128(g_buf + i);
__m128i b = LOAD_SI128(b_buf + i);
/* r<<6; g<<6; b<<6 */
r = _mm_slli_epi16(r, 6);
g = _mm_slli_epi16(g, 6);
@@ -708,21 +708,21 @@ sse2_RGBToYCbCr_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
y = _mm_add_epi16(y, min);
/* y_r_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
mm_between_epi16(y, min, max);
_mm_store_si128(y_buf + i, y);
STORE_SI128(y_buf + i, y);
/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
__m128i cb = _mm_mulhi_epi16(r, cb_r);
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
/* cb_g_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
mm_between_epi16(cb, min, max);
_mm_store_si128(cb_buf + i, cb);
STORE_SI128(cb_buf + i, cb);
/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
__m128i cr = _mm_mulhi_epi16(r, cr_r);
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
/* cr_b_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
mm_between_epi16(cr, min, max);
_mm_store_si128(cr_buf + i, cr);
STORE_SI128(cr_buf + i, cr);
}
y_buf += srcbump;
@@ -769,27 +769,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -801,22 +801,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_BGRX(
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -875,27 +875,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -912,22 +912,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_RGBX(
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -986,27 +986,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -1023,22 +1023,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XBGR(
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -1097,27 +1097,27 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pb);
R0 = LOAD_SI128(pb);
pb += 8; /* R0 = 00B300B200B100B0 */
R1 = _mm_load_si128((const __m128i*)pb);
R1 = LOAD_SI128(pb);
pb += 8; /* R1 = 00B700B600B500B4 */
b = _mm_packus_epi16(R0, R1); /* b = B7B6B5B4B3B2B1B0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pg);
R0 = LOAD_SI128(pg);
pg += 8; /* R1 = 00G300G200G100G0 */
R1 = _mm_load_si128((const __m128i*)pg);
R1 = LOAD_SI128(pg);
pg += 8; /* R2 = 00G700G600G500G4 */
g = _mm_packus_epi16(R0, R1); /* g = G7G6G5G4G3G2G1G0 */
}
{
__m128i R0;
__m128i R1;
R0 = _mm_load_si128((const __m128i*)pr);
R0 = LOAD_SI128(pr);
pr += 8; /* R0 = 00R300R200R100R0 */
R1 = _mm_load_si128((const __m128i*)pr);
R1 = LOAD_SI128(pr);
pr += 8; /* R3 = 00R700R600R500R4 */
r = _mm_packus_epi16(R0, R1); /* r = R7R6R5R4R3R2R1R0 */
}
@@ -1134,22 +1134,22 @@ static pstatus_t sse2_RGBToRGB_16s8u_P3AC4R_XRGB(
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR1G1B1FFR0G0B0 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbLo, arLo);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR3G3B3FFR2G2B2 */
}
{
const __m128i bgrx = _mm_unpacklo_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR5G5B5FFR4G4B4 */
}
{
const __m128i bgrx = _mm_unpackhi_epi16(gbHi, arHi);
_mm_store_si128((__m128i*)out, bgrx);
STORE_SI128(out, bgrx);
out += 16; /* FFR7G7B7FFR6G6B6 */
}
}
@@ -1217,7 +1217,8 @@ void primitives_init_colors_sse2(primitives_t* prims)
generic = primitives_get_generic();
primitives_init_colors(prims);
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE2 optimizations");
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;

View File

@@ -22,6 +22,7 @@
#include <freerdp/primitives.h>
#include <freerdp/log.h>
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_copy.h"
#include "../codec/color.h"
@@ -68,12 +69,12 @@ static INLINE pstatus_t sse_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstDat
{
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
const __m128i s0 = _mm_loadu_si128(src);
const __m128i s0 = LOAD_SI128(src);
const __m128i s1 = _mm_shuffle_epi8(s0, smask);
const __m128i s2 = _mm_loadu_si128(dst);
const __m128i s2 = LOAD_SI128(dst);
__m128i d0 = _mm_blendv_epi8(s1, s2, mask);
_mm_storeu_si128(dst, d0);
STORE_SI128(dst, d0);
}
}
for (; x < nWidth; x++)
@@ -118,10 +119,10 @@ static INLINE pstatus_t sse_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstDa
{
const __m128i* src = (const __m128i*)&srcLine[(x + nXSrc) * srcByte];
__m128i* dst = (__m128i*)&dstLine[(x + nXDst) * dstByte];
const __m128i s0 = _mm_loadu_si128(src);
const __m128i s1 = _mm_loadu_si128(dst);
const __m128i s0 = LOAD_SI128(src);
const __m128i s1 = LOAD_SI128(dst);
__m128i d0 = _mm_blendv_epi8(s1, s0, mask);
_mm_storeu_si128(dst, d0);
STORE_SI128(dst, d0);
}
for (; x < nWidth; x++)

View File

@@ -21,7 +21,8 @@
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>
#include "prim_avxsse.h".h "
#include "prim_internal.h"
#include "prim_avxsse.h"
#include "prim_set.h"
/* ========================================================================= */
@@ -60,37 +61,37 @@ static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
}
@@ -101,7 +102,7 @@ static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 16;
}
@@ -152,37 +153,37 @@ static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 le
/* Do 256-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
}
@@ -193,7 +194,7 @@ static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 le
/* Do 16-byte chunks using one XMM register. */
while (count--)
{
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 4;
}
@@ -220,7 +221,8 @@ void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
primitives_init_set(prims);
/* Pick tuned versions if possible. */
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
{
WLog_VRB(PRIM_TAG, "SSE2 optimizations");
prims->set_8u = sse2_set_8u;

View File

@@ -79,14 +79,14 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
{
const __m128i* src = (const __m128i*)pSrcDst;
__m128i xmm0 = _mm_load_si128(src++);
__m128i xmm1 = _mm_load_si128(src++);
__m128i xmm2 = _mm_load_si128(src++);
__m128i xmm3 = _mm_load_si128(src++);
__m128i xmm4 = _mm_load_si128(src++);
__m128i xmm5 = _mm_load_si128(src++);
__m128i xmm6 = _mm_load_si128(src++);
__m128i xmm7 = _mm_load_si128(src);
__m128i xmm0 = LOAD_SI128(src++);
__m128i xmm1 = LOAD_SI128(src++);
__m128i xmm2 = LOAD_SI128(src++);
__m128i xmm3 = LOAD_SI128(src++);
__m128i xmm4 = LOAD_SI128(src++);
__m128i xmm5 = LOAD_SI128(src++);
__m128i xmm6 = LOAD_SI128(src++);
__m128i xmm7 = LOAD_SI128(src);
xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
@@ -99,14 +99,14 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
__m128i* dst = (__m128i*)pSrcDst;
_mm_store_si128(dst++, xmm0);
_mm_store_si128(dst++, xmm1);
_mm_store_si128(dst++, xmm2);
_mm_store_si128(dst++, xmm3);
_mm_store_si128(dst++, xmm4);
_mm_store_si128(dst++, xmm5);
_mm_store_si128(dst++, xmm6);
_mm_store_si128(dst++, xmm7);
STORE_SI128(dst++, xmm0);
STORE_SI128(dst++, xmm1);
STORE_SI128(dst++, xmm2);
STORE_SI128(dst++, xmm3);
STORE_SI128(dst++, xmm4);
STORE_SI128(dst++, xmm5);
STORE_SI128(dst++, xmm6);
STORE_SI128(dst++, xmm7);
pSrcDst = (INT16*)dst;
}
@@ -122,7 +122,7 @@ static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32
xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
__m128i* dst = (__m128i*)pSrcDst;
_mm_store_si128(dst++, xmm0);
STORE_SI128(dst++, xmm0);
pSrcDst = (INT16*)dst;
}

View File

@@ -21,6 +21,7 @@
#include "prim_sign.h"
#include "prim_internal.h"
#include "prim_avxsse.h"
#if defined(SSE_AVX_INTRINSICS_ENABLED)
@@ -79,25 +80,25 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
xmm4 = LOAD_SI128(sptr);
sptr += 8;
xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
xmm5 = LOAD_SI128(sptr);
sptr += 8;
xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
xmm6 = LOAD_SI128(sptr);
sptr += 8;
xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
xmm7 = LOAD_SI128(sptr);
sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm1);
STORE_SI128(dptr, xmm1);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm2);
STORE_SI128(dptr, xmm2);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm3);
STORE_SI128(dptr, xmm3);
dptr += 8;
}
}
@@ -118,25 +119,25 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
xmm1 = _mm_set1_epi16(0x0001U);
xmm2 = _mm_set1_epi16(0x0001U);
xmm3 = _mm_set1_epi16(0x0001U);
xmm4 = _mm_load_si128((const __m128i*)sptr);
xmm4 = LOAD_SI128(sptr);
sptr += 8;
xmm5 = _mm_load_si128((const __m128i*)sptr);
xmm5 = LOAD_SI128(sptr);
sptr += 8;
xmm6 = _mm_load_si128((const __m128i*)sptr);
xmm6 = LOAD_SI128(sptr);
sptr += 8;
xmm7 = _mm_load_si128((const __m128i*)sptr);
xmm7 = LOAD_SI128(sptr);
sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm4);
xmm1 = _mm_sign_epi16(xmm1, xmm5);
xmm2 = _mm_sign_epi16(xmm2, xmm6);
xmm3 = _mm_sign_epi16(xmm3, xmm7);
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm1);
STORE_SI128(dptr, xmm1);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm2);
STORE_SI128(dptr, xmm2);
dptr += 8;
_mm_store_si128((__m128i*)dptr, xmm3);
STORE_SI128(dptr, xmm3);
dptr += 8;
}
}
@@ -151,7 +152,7 @@ static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_R
__m128i xmm1 = LOAD_SI128(sptr);
sptr += 8;
xmm0 = _mm_sign_epi16(xmm0, xmm1);
_mm_store_si128((__m128i*)dptr, xmm0);
STORE_SI128(dptr, xmm0);
dptr += 8;
}