From 15ce160183c61fbd98915e07879f1225fcf1c080 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet
Date: Tue, 10 Mar 2015 23:11:54 +0000
Subject: [PATCH] x86: xvid_idct: SSE2 merged add version

Signed-off-by: Michael Niedermayer
---
 libavcodec/x86/xvididct.asm    | 92 ++++++++++++++++++++++++++++++++--
 libavcodec/x86/xvididct_init.c |  9 +---
 2 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 58ffb11806..0220885da6 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -384,6 +384,12 @@ SECTION .text
     ; Must now load args as gprs are no longer used for masks
     ; DEST is set to where address of dest was loaded
     %if ARCH_X86_32
+        %if %2 == 2 ; Not enough xmms, store
+    movdqa [%1+1*16], TAN3
+    movdqa [%1+2*16], xmm3
+    movdqa [%1+5*16], REG0
+    movdqa [%1+6*16], xmm5
+        %endif
         %xdefine DEST r2q ; BLOCK is r0, stride r1
     movifnidn DEST, destm
     movifnidn strideq, stridem
@@ -397,8 +403,6 @@ SECTION .text
     movq [DEST + strideq], TAN3
     movhps [DEST + 2*strideq], TAN3
     ; REG0 and TAN3 are now available (and likely used in second half)
-    %else
-        %warning Unimplemented
     %endif
 %endif
 %endmacro
@@ -427,7 +431,88 @@ SECTION .text
     movq [DEST + 2*strideq], xmm5
     movhps [DEST + strideq], xmm5
 %elif %2 == 2
-%warning Unimplemented
+    pxor xmm0, xmm0
+    %if ARCH_X86_32
+    ; free: m3 REG0=m4 m5
+    ; input: m1, m7, m2, m6
+    movq xmm3, [DEST+0*strideq]
+    movq xmm4, [DEST+1*strideq]
+    punpcklbw xmm3, xmm0
+    punpcklbw xmm4, xmm0
+    paddsw xmm3, %3
+    paddsw xmm4, [%1 + 1*16]
+    movq %3, [DEST+2*strideq]
+    movq xmm5, [DEST+ r3q]
+    punpcklbw %3, xmm0
+    punpcklbw xmm5, xmm0
+    paddsw %3, [%1 + 2*16]
+    paddsw xmm5, %5
+    packuswb xmm3, xmm4
+    packuswb %3, xmm5
+    movq [DEST+0*strideq], xmm3
+    movhps [DEST+1*strideq], xmm3
+    movq [DEST+2*strideq], %3
+    movhps [DEST+ r3q], %3
+    lea DEST, [DEST+4*strideq]
+    movq xmm3, [DEST+0*strideq]
+    movq xmm4, [DEST+1*strideq]
+    movq %3, [DEST+2*strideq]
+    movq xmm5, [DEST+ r3q]
+    punpcklbw xmm3, xmm0
+    punpcklbw xmm4, xmm0
+    punpcklbw %3, xmm0
+    punpcklbw xmm5, xmm0
+    paddsw xmm3, %6
+    paddsw xmm4, [%1 + 5*16]
+    paddsw %3, [%1 + 6*16]
+    paddsw xmm5, %4
+    packuswb xmm3, xmm4
+    packuswb %3, xmm5
+    movq [DEST+0*strideq], xmm3
+    movhps [DEST+1*strideq], xmm3
+    movq [DEST+2*strideq], %3
+    movhps [DEST+ r3q], %3
+    %else
+    ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
+    ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
+    movq xmm2, [DEST+0*strideq]
+    movq xmm4, [DEST+1*strideq]
+    movq xmm12, [DEST+2*strideq]
+    movq xmm11, [DEST+ r3q]
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm4, xmm0
+    punpcklbw xmm12, xmm0
+    punpcklbw xmm11, xmm0
+    paddsw xmm2, %3
+    paddsw xmm4, TAN3
+    paddsw xmm12, xmm3
+    paddsw xmm11, %5
+    packuswb xmm2, xmm4
+    packuswb xmm12, xmm11
+    movq [DEST+0*strideq], xmm2
+    movhps [DEST+1*strideq], xmm2
+    movq [DEST+2*strideq], xmm12
+    movhps [DEST+ r3q], xmm12
+    lea DEST, [DEST+4*strideq]
+    movq xmm2, [DEST+0*strideq]
+    movq xmm4, [DEST+1*strideq]
+    movq xmm12, [DEST+2*strideq]
+    movq xmm11, [DEST+ r3q]
+    punpcklbw xmm2, xmm0
+    punpcklbw xmm4, xmm0
+    punpcklbw xmm12, xmm0
+    punpcklbw xmm11, xmm0
+    paddsw xmm2, %6
+    paddsw xmm4, REG0
+    paddsw xmm12, xmm5
+    paddsw xmm11, %4
+    packuswb xmm2, xmm4
+    packuswb xmm12, xmm11
+    movq [DEST+0*strideq], xmm2
+    movhps [DEST+1*strideq], xmm2
+    movq [DEST+2*strideq], xmm12
+    movhps [DEST+ r3q], xmm12
+    %endif
 %endif
 %endmacro

@@ -623,6 +708,7 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
 INIT_XMM sse2
 IDCT_SSE2 0
 IDCT_SSE2 1
+IDCT_SSE2 2

 %if ARCH_X86_32

diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index 2530d7aad2..57f6ed6dc3 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -27,12 +27,7 @@
 #include "xvididct.h"

 void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
-
-static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_add_pixels_clamped(block, dest, line_size);
-}
+void ff_xvid_idct_add_sse2(uint8_t *dest, int line_size, short *block);

 #if ARCH_X86_32
 static void xvid_idct_mmx_put(uint8_t *dest, int line_size, short *block)
@@ -88,7 +83,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,

     if (EXTERNAL_SSE2(cpu_flags)) {
         c->idct_put = ff_xvid_idct_put_sse2;
-        c->idct_add = xvid_idct_sse2_add;
+        c->idct_add = ff_xvid_idct_add_sse2;
         c->idct = ff_xvid_idct_sse2;
         c->perm_type = FF_IDCT_PERM_SSE2;
     }