From 2f0591cfa3b773d7a2fec72b30ec25d4ffb0cb32 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Tue, 4 Apr 2017 12:17:08 -0400
Subject: [PATCH] cavs: add an sse2 idct implementation.

This makes using the function pointer ff_add_pixels_clamped() unnecessary,
since we always know what the best implementation is at compile-time.
---
 libavcodec/x86/cavsdsp.c    | 15 +++++++++++-
 libavcodec/x86/cavsidct.asm | 48 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index add4536783..a8a198b46d 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -29,6 +29,7 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/cavsdsp.h"
 #include "libavcodec/idctdsp.h"
+#include "libavcodec/x86/idctdsp.h"
 #include "constants.h"
 #include "fpel.h"
 #include "idctdsp.h"
@@ -43,7 +44,16 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, ptrdiff_t stride)
 {
     LOCAL_ALIGNED(16, int16_t, b2, [64]);
     ff_cavs_idct8_mmx(b2, block);
-    ff_add_pixels_clamped(b2, dst, stride);
+    ff_add_pixels_clamped_mmx(b2, dst, stride);
+}
+
+void ff_cavs_idct8_sse2(int16_t *out, const int16_t *in);
+
+static void cavs_idct8_add_sse2(uint8_t *dst, int16_t *block, ptrdiff_t stride)
+{
+    LOCAL_ALIGNED(16, int16_t, b2, [64]);
+    ff_cavs_idct8_sse2(b2, block);
+    ff_add_pixels_clamped_sse2(b2, dst, stride);
 }
 
 #endif /* HAVE_MMX_EXTERNAL */
@@ -446,6 +456,9 @@ av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_sse2;
         c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_sse2;
+
+        c->cavs_idct8_add = cavs_idct8_add_sse2;
+        c->idct_perm = FF_IDCT_PERM_TRANSPOSE;
     }
 #endif
 }
diff --git a/libavcodec/x86/cavsidct.asm b/libavcodec/x86/cavsidct.asm
index 5421196e1b..6c768c2646 100644
--- a/libavcodec/x86/cavsidct.asm
+++ b/libavcodec/x86/cavsidct.asm
@@ -29,11 +29,16 @@ cextern pw_64
 
 SECTION .text
 
-%macro CAVS_IDCT8_1D 2 ; source, round
+%macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
+%if %3 == 1
     mova            m4, [%1+7*16]       ; m4 = src7
     mova            m5, [%1+1*16]       ; m5 = src1
     mova            m2, [%1+5*16]       ; m2 = src5
     mova            m7, [%1+3*16]       ; m7 = src3
+%else
+    SWAP 1, 7                           ; the odd rows already sit in registers
+    SWAP 4, 6                           ; after the transpose; just reorder them
+%endif
     mova            m0, m4
     mova            m3, m5
     mova            m6, m2
@@ -163,3 +168,44 @@ cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
     jg .loop_2
 
     RET
+
+INIT_XMM sse2
+cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
+    CAVS_IDCT8_1D inq, [pw_4]           ; pass 1
+    psraw           m7, 3               ; descale pass 1: (x + 4) >> 3
+    psraw           m6, 3
+    psraw           m5, 3
+    psraw           m4, 3
+    psraw           m3, 3
+    psraw           m2, 3
+    psraw           m1, 3
+    psraw           m0, 3
+%if ARCH_X86_64
+    TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, 8 ; m8 as scratch
+    mova [rsp+4*16], m0
+%else
+    mova [rsp+0*16], m4                 ; memory-assisted transpose on x86-32
+    TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1
+%endif
+    mova [rsp+0*16], m7                 ; spill the even rows (0/2/4/6);
+    mova [rsp+2*16], m3                 ; pass 2 reads them from the stack
+    mova [rsp+6*16], m4
+    CAVS_IDCT8_1D rsp, [pw_64], 0       ; pass 2, odd rows kept in registers
+    psraw           m7, 7               ; descale pass 2: (x + 64) >> 7
+    psraw           m6, 7
+    psraw           m5, 7
+    psraw           m4, 7
+    psraw           m3, 7
+    psraw           m2, 7
+    psraw           m1, 7
+    psraw           m0, 7
+
+    mova [outq+0*16], m7                ; rows end up in m7,m5,m3,m1,m0,m2,m4,m6
+    mova [outq+1*16], m5
+    mova [outq+2*16], m3
+    mova [outq+3*16], m1
+    mova [outq+4*16], m0
+    mova [outq+5*16], m2
+    mova [outq+6*16], m4
+    mova [outq+7*16], m6
+    RET
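
Note on the idct_perm line, which is the subtle part of this patch: the sse2
idct transposes the block only once, between its two 1-D passes, so it wants
its input coefficients already transposed; FF_IDCT_PERM_TRANSPOSE makes the
decoder dequantize coefficients into transposed positions, and the output then
comes out in natural order. Below is a minimal sketch of the implied
permutation, the plain row/column swap that the generic init code in
libavcodec/idctdsp.c applies for this perm type; the helper and test names
here are illustrative, not FFmpeg API:

    #include <stdint.h>
    #include <stdio.h>

    /* Row/column swap implied by FF_IDCT_PERM_TRANSPOSE: the coefficient
     * for position (row, col) is stored at (col, row), so an idct with a
     * single mid-way transpose still yields output in natural order. */
    static void idct_perm_transpose(uint8_t perm[64])
    {
        for (int i = 0; i < 64; i++)
            perm[i] = ((i & 7) << 3) | (i >> 3);
    }

    int main(void)
    {
        uint8_t perm[64];
        idct_perm_transpose(perm);
        /* coefficient (1, 0) lands at index 1, i.e. position (0, 1) */
        printf("perm[8] = %d\n", perm[8]);
        return 0;
    }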