From decd5193e1d409a4d8dd4c55ca46467d51c86976 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet
Date: Tue, 10 Mar 2015 23:11:53 +0000
Subject: [PATCH] x86: xvid_idct: merged idct_put SSE2 versions

Signed-off-by: Michael Niedermayer
---
 libavcodec/x86/xvididct.asm    | 202 ++++++++++++++++++++++-----------
 libavcodec/x86/xvididct_init.c |   8 +-
 2 files changed, 140 insertions(+), 70 deletions(-)

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 4c52bf159e..58ffb11806 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -292,13 +292,13 @@ SECTION .text
 %define TAN3 xmm13
 %define TAN1 xmm14
 %else
-%define ROW0 [r0 + 0*16]
+%define ROW0 [BLOCK + 0*16]
 %define REG0 xmm4
-%define ROW2 [r0 + 2*16]
+%define ROW2 [BLOCK + 2*16]
 %define REG2 xmm4
-%define ROW4 [r0 + 4*16]
+%define ROW4 [BLOCK + 4*16]
 %define REG4 xmm6
-%define ROW6 [r0 + 6*16]
+%define ROW6 [BLOCK + 6*16]
 %define REG6 xmm6
 %define XMMS xmm2
 %define SREG2 xmm7
@@ -369,8 +369,71 @@ SECTION .text
     movdqa TAN1, [tan1]
 %endmacro

+%macro FIRST_HALF 2 ; %1=dct %2=type(normal,add,put)
+    psraw xmm5, 6
+    psraw REG0, 6
+    psraw TAN3, 6
+    psraw xmm3, 6
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa [%1+1*16], TAN3
+    movdqa [%1+2*16], xmm3
+    movdqa [%1+5*16], REG0
+    movdqa [%1+6*16], xmm5
+%else
+    ; Must now load args as gprs are no longer used for masks
+    ; DEST is set to where address of dest was loaded
+    %if ARCH_X86_32
+    %xdefine DEST r2q ; BLOCK is r0, stride r1
+    movifnidn DEST, destm
+    movifnidn strideq, stridem
+    %else
+    %xdefine DEST r0q
+    %endif
+    lea r3q, [3*strideq]
+    %if %2 == 1
+    packuswb TAN3, xmm3
+    packuswb xmm5, REG0
+    movq   [DEST + strideq], TAN3
+    movhps [DEST + 2*strideq], TAN3
+    ; REG0 and TAN3 are now available (and likely used in second half)
+    %else
+    %warning Unimplemented
+    %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct %2=type(normal,add,put) 3-6: xmms
+    psraw %3, 6
+    psraw %4, 6
+    psraw %5, 6
+    psraw %6, 6
+    ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+    movdqa [%1+0*16], %3
+    movdqa [%1+3*16], %5
+    movdqa [%1+4*16], %6
+    movdqa [%1+7*16], %4
+%elif %2 == 1
+    packuswb %3, %5
+    packuswb %6, %4
+    ; address of dest may have been loaded
+    movq   [DEST], %3
+    movhps [DEST + r3q], %3
+    lea    DEST, [DEST + 4*strideq]
+    movq   [DEST], %6
+    movhps [DEST + r3q], %6
+    ; and now write remainder of first half
+    movq   [DEST + 2*strideq], xmm5
+    movhps [DEST + strideq], xmm5
+%elif %2 == 2
+%warning Unimplemented
+%endif
+%endmacro
+
+
 ; IDCT pass on columns.
-%macro iLLM_PASS 1 ;dct
+%macro iLLM_PASS 2 ; %1=dct %2=type(normal,add,put)
     movdqa xmm1, TAN3
     movdqa xmm3, TAN1
     pmulhw TAN3, xmm4
@@ -407,7 +470,7 @@ SECTION .text
     psubsw xmm5, REG6
     MOV32  ROW0, REG0
     MOV32  ROW4, REG4
-    MOV32  TAN1, [r0]
+    MOV32  TAN1, [BLOCK]
     movdqa XMMS, REG0
     psubsw REG0, REG4
     paddsw REG4, XMMS
@@ -423,33 +486,22 @@ SECTION .text
     movdqa XMMS, REG0
     psubsw REG0, xmm3
     paddsw xmm3, XMMS
-    MOV32  [r0], TAN1
-    psraw  xmm5, 6
-    psraw  REG0, 6
-    psraw  TAN3, 6
-    psraw  xmm3, 6
-    movdqa [%1+1*16], TAN3
-    movdqa [%1+2*16], xmm3
-    movdqa [%1+5*16], REG0
-    movdqa [%1+6*16], xmm5
+    MOV32  [BLOCK], TAN1
+
+    FIRST_HALF %1, %2
+
     movdqa xmm0, xmm7
     movdqa xmm4, REG4
     psubsw xmm7, xmm1
     psubsw REG4, TAN1
     paddsw xmm1, xmm0
     paddsw TAN1, xmm4
-    psraw  xmm1, 6
-    psraw  xmm7, 6
-    psraw  TAN1, 6
-    psraw  REG4, 6
-    movdqa [%1+0*16], xmm1
-    movdqa [%1+3*16], TAN1
-    movdqa [%1+4*16], REG4
-    movdqa [%1+7*16], xmm7
+
+    SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
 %endmacro

 ; IDCT pass on columns, assuming rows 4-7 are zero
-%macro iLLM_PASS_SPARSE 1 ;dct
+%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
     pmulhw TAN3, xmm4
     paddsw TAN3, xmm4
     movdqa xmm3, xmm6
@@ -475,7 +527,7 @@ SECTION .text
     movdqa xmm6, REG0
     psubsw xmm6, SREG2
     paddsw SREG2, REG0
-    MOV32  TAN1, [r0]
+    MOV32  TAN1, [BLOCK]
     movdqa XMMS, REG0
     psubsw REG0, xmm5
     paddsw xmm5, XMMS
@@ -485,70 +537,92 @@ SECTION .text
     movdqa XMMS, REG0
     psubsw REG0, xmm3
     paddsw xmm3, XMMS
-    MOV32  [r0], TAN1
-    psraw  xmm5, 6
-    psraw  REG0, 6
-    psraw  TAN3, 6
-    psraw  xmm3, 6
-    movdqa [%1+1*16], TAN3
-    movdqa [%1+2*16], xmm3
-    movdqa [%1+5*16], REG0
-    movdqa [%1+6*16], xmm5
+    MOV32  [BLOCK], TAN1
+
+    FIRST_HALF %1, %2
+
     movdqa xmm0, SREG2
     movdqa xmm4, xmm6
     psubsw SREG2, xmm1
     psubsw xmm6, TAN1
     paddsw xmm1, xmm0
     paddsw TAN1, xmm4
-    psraw  xmm1, 6
-    psraw  SREG2, 6
-    psraw  TAN1, 6
-    psraw  xmm6, 6
-    movdqa [%1+0*16], xmm1
-    movdqa [%1+3*16], TAN1
-    movdqa [%1+4*16], xmm6
-    movdqa [%1+7*16], SREG2
+
+    SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
 %endmacro

-INIT_XMM sse2
-cglobal xvid_idct, 1, 5, 8+7*ARCH_X86_64, block
+%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
+%if %1 == 0 || ARCH_X86_32
+    %define GPR0 r1d
+    %define GPR1 r2d
+    %define GPR2 r3d
+    %define GPR3 r4d
+    %define NUM_GPRS 5
+%else
+    %define GPR0 r3d
+    %define GPR1 r4d
+    %define GPR2 r5d
+    %define GPR3 r6d
+    %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+    %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+    %endif
+    %if ARCH_X86_64
+    %xdefine BLOCK blockq
+    %else
+    mov r0q, blockm
+    %xdefine BLOCK r0q
+    %endif
+%endif
     movq mm0, [pb_127]
-    iMTX_MULT r0 + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
-    iMTX_MULT r0 + 1*16, iTab2, PUT_ODD, ROW1, 1*16
-    iMTX_MULT r0 + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+    iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+    iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
+    iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16

-    TEST_TWO_ROWS r0 + 3*16, r0 + 4*16, r1d, r2d, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
-    JZ r1d, col1
-    iMTX_MULT r0 + 3*16, iTab4, PUT_ODD, ROW3, 3*16
+    TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
+    JZ GPR0, col1
+    iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
 .col1:
-    TEST_TWO_ROWS r0 + 5*16, r0 + 6*16, r1d, r3d, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
-    TEST_ONE_ROW r0 + 7*16, r4d, CLEAR_ODD, ROW7 ; esi
+    TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
+    TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
     iLLM_HEAD
-    JNZ r2d, 2
-    JNZ r1d, 3
-    JNZ r3d, 4
-    JNZ r4d, 5
-    iLLM_PASS_SPARSE r0
+    JNZ GPR1, 2
+    JNZ GPR0, 3
+    JNZ GPR2, 4
+    JNZ GPR3, 5
+    iLLM_PASS_SPARSE BLOCK, %1
     jmp .6
 .2:
-    iMTX_MULT r0 + 4*16, iTab1, PUT_EVEN, ROW4
+    iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
 .3:
-    iMTX_MULT r0 + 5*16, iTab4, PUT_ODD, ROW5, 4*16
-    JZ r3d, col2
+    iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
+    JZ GPR2, col2
 .4:
-    iMTX_MULT r0 + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+    iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
 .col2:
-    JZ r4d, col3
+    JZ GPR3, col3
 .5:
-    iMTX_MULT r0 + 7*16, iTab2, PUT_ODD, ROW7, 5*16
+    iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
 .col3:
 %if ARCH_X86_32
     iLLM_HEAD
 %endif
-    iLLM_PASS r0
+    iLLM_PASS BLOCK, %1
 .6:
     RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0
+IDCT_SSE2 1

 %if ARCH_X86_32
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index b429032ce1..2530d7aad2 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -26,11 +26,7 @@
 #include "idctdsp.h"
 #include "xvididct.h"

-static void xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
-{
-    ff_xvid_idct_sse2(block);
-    ff_put_pixels_clamped(block, dest, line_size);
-}
+void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);

 static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
 {
@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
 #endif

     if (EXTERNAL_SSE2(cpu_flags)) {
-        c->idct_put = xvid_idct_sse2_put;
+        c->idct_put = ff_xvid_idct_put_sse2;
         c->idct_add = xvid_idct_sse2_add;
         c->idct = ff_xvid_idct_sse2;
         c->perm_type = FF_IDCT_PERM_SSE2;
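
For readers following the asm: the new xvid_idct_put entry point folds what used
to be two steps in C (ff_xvid_idct_sse2() on the coefficient block, then
ff_put_pixels_clamped() to the picture) into the column pass itself. In the put
path, FIRST_HALF/SECOND_HALF shift each 16-bit column-pass result right by 6
(psraw) and let packuswb clamp it to [0,255] with unsigned saturation before
storing straight to dest, so the block never has to round-trip through memory.
A minimal scalar C sketch of that store path follows; the function and helper
names are illustrative, not FFmpeg API.

#include <stdint.h>

/* clamp an int to the unsigned 8-bit range, as packuswb does per lane */
static uint8_t clamp_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* scalar model of the merged "put" store: block[] holds the 8x8
 * column-pass results, still scaled by 2^6 (hence the >> 6) */
static void put_block_clamped(uint8_t *dest, int stride, const int16_t *block)
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            dest[y * stride + x] = clamp_u8(block[y * 8 + x] >> 6);
}

The normal (%2 == 0) path is unchanged in substance: it keeps writing the
shifted words back to the coefficient block, which the comment in FIRST_HALF
notes is still required for AC prediction.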