From c8467abbadab424757ea23f71a1036abfb7f14b4 Mon Sep 17 00:00:00 2001 From: James Almer Date: Thu, 2 Feb 2017 17:51:21 -0300 Subject: [PATCH] x86/rv34dsp: add ff_rv34_idct_dc_add_sse2 Also disable ff_rv34_idct_dc_add_mmx on x86_64 as the presence of sse2 is guaranteed in such builds. Signed-off-by: James Almer --- libavcodec/x86/rv34dsp.asm | 19 ++++++++++++++++++- libavcodec/x86/rv34dsp_init.c | 5 ++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index 7732d65b2a..692b4acfcd 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -64,6 +64,7 @@ rv34_idct dc rv34_idct dc_noround ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc); +%if ARCH_X86_32 INIT_MMX mmx cglobal rv34_idct_dc_add, 3, 3 ; calculate DC @@ -97,6 +98,7 @@ cglobal rv34_idct_dc_add, 3, 3 movh [r2], m4 movh [r2+r1], m5 RET +%endif ; Load coeffs and perform row transform ; Output: coeffs in mm[0467], rounder in mm5 @@ -167,7 +169,7 @@ cglobal rv34_idct_add, 3,3,0, d, s, b ret ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc); -INIT_XMM sse4 +%macro RV34_IDCT_DC_ADD 0 cglobal rv34_idct_dc_add, 3, 3, 6 ; load data IDCT_DC_ROUND r2 @@ -190,7 +192,22 @@ cglobal rv34_idct_dc_add, 3, 3, 6 paddw m4, m0 packuswb m2, m4 movd [r0], m2 +%if cpuflag(sse4) pextrd [r0+r1], m2, 1 pextrd [r2], m2, 2 pextrd [r2+r1], m2, 3 +%else + psrldq m2, 4 + movd [r0+r1], m2 + psrldq m2, 4 + movd [r2], m2 + psrldq m2, 4 + movd [r2+r1], m2 +%endif RET +%endmacro + +INIT_XMM sse2 +RV34_IDCT_DC_ADD +INIT_XMM sse4 +RV34_IDCT_DC_ADD diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c index d417dac45b..7310122458 100644 --- a/libavcodec/x86/rv34dsp_init.c +++ b/libavcodec/x86/rv34dsp_init.c @@ -27,6 +27,7 @@ void ff_rv34_idct_dc_mmxext(int16_t *block); void ff_rv34_idct_dc_noround_mmxext(int16_t *block); void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc); +void ff_rv34_idct_dc_add_sse2(uint8_t *dst, ptrdiff_t stride, int dc); void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc); void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); @@ -34,12 +35,14 @@ av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) + if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx; if (EXTERNAL_MMXEXT(cpu_flags)) { c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext; c->rv34_idct_add = ff_rv34_idct_add_mmxext; } + if (EXTERNAL_SSE2(cpu_flags)) + c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse2; if (EXTERNAL_SSE4(cpu_flags)) c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4; }