diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 9eec10a0a7..4958ff794b 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -196,6 +196,7 @@ HVBILIN(ssse3, 8, 16, 16)
 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 #endif
 
 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -229,6 +230,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
 #if HAVE_YASM
     if (mm_flags & FF_MM_MMX) {
         c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+        c->vp8_idct_add = ff_vp8_idct_add_mmx;
         c->put_vp8_epel_pixels_tab[0][0][0] =
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
         c->put_vp8_epel_pixels_tab[1][0][0] =
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 118d07196e..22947763f9 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -142,6 +142,9 @@ filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
 
+pw_20091: times 4 dw 20091
+pw_17734: times 4 dw 17734
+
 cextern pw_3
 cextern pw_4
 cextern pw_64
@@ -923,6 +926,92 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
     pextrd    [r1+r2], xmm2, 3
     RET
 
+;-----------------------------------------------------------------------------
+; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
+;-----------------------------------------------------------------------------
+
+; calculate %1=%2+%1; %2=%2-%1, with %3=temp register
+%macro SUMSUB 3
+    mova       %3, %1
+    paddw      %1, %2
+    psubw      %2, %3
+%endmacro
+
+; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
+; this macro assumes that m6/m7 have words for 20091/17734 loaded
+%macro VP8_MULTIPLY_SUMSUB 4
+    mova       %3, %1
+    mova       %4, %2
+    pmulhw     %3, m6 ;20091(1)
+    pmulhw     %4, m6 ;20091(2)
+    paddw      %3, %1
+    paddw      %4, %2
+    psllw      %1, 1
+    psllw      %2, 1
+    pmulhw     %1, m7 ;35468(1)
+    pmulhw     %2, m7 ;35468(2)
+    psubw      %1, %4
+    paddw      %2, %3
+%endmacro
+
+; calculate x0=%1+%3; x1=%1-%3
+;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
+;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
+; %5/%6 are temporary registers
+; we assume m6/m7 have constant words 20091/17734 loaded in them
+%macro VP8_IDCT_TRANSFORM4x4_1D 6
+    SUMSUB_BA           m%3, m%1, m%5      ;t0, t1
+    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ;t2, t3
+    SUMSUB_BA           m%4, m%3, m%5      ;tmp0, tmp3
+    SUMSUB_BA           m%2, m%1, m%5      ;tmp1, tmp2
+    SWAP %4, %1
+    SWAP %4, %3
+%endmacro
+
+; transpose a 4x4 table
+%macro TRANSPOSE4x4 5 ; output in %1/%4/%5/%3
+    mova       m%5, m%1
+    punpcklwd  m%1, m%2
+    punpckhwd  m%5, m%2
+    mova       m%2, m%3
+    punpcklwd  m%3, m%4
+    punpckhwd  m%2, m%4
+    mova       m%4, m%1
+    punpckldq  m%1, m%3 ;col0
+    punpckhdq  m%4, m%3 ;col1
+    mova       m%3, m%5
+    punpckldq  m%5, m%2 ;col2
+    punpckhdq  m%3, m%2 ;col3
+    SWAP %4, %2
+    SWAP %4, %5
+    SWAP %4, %3
+%endmacro
+
+INIT_MMX
+cglobal vp8_idct_add_mmx, 3, 3
+    ; load block data
+    movq       m0, [r1]
+    movq       m1, [r1+8]
+    movq       m2, [r1+16]
+    movq       m3, [r1+24]
+    movq       m6, [pw_20091]
+    movq       m7, [pw_17734]
+
+    ; actual IDCT
+    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+    TRANSPOSE4x4W            0, 1, 2, 3, 4
+    paddw      m0, [pw_4]
+    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
+    TRANSPOSE4x4W            0, 1, 2, 3, 4
+
+    ; store
+    pxor       m4, m4
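+    ; m4 = 0 is the zero register STORE_DIFFx2 uses for byte/word unpacking;
+    ; r1 (the block pointer, no longer needed) is reused to address rows 2/3,
+    ; and the shift of 3 is the final descale, with rounding supplied by the
+    ; pw_4 added before the second pass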
+    lea        r1, [r0+2*r2]
+    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
+    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
 ;-----------------------------------------------------------------------------
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm
index 674ca1aab1..3051a7f147 100644
--- a/libavcodec/x86/x86util.asm
+++ b/libavcodec/x86/x86util.asm
@@ -365,3 +365,18 @@
     packuswb   %1, %1
     movh       %4, %1
 %endmacro
+
+%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
+    movh       %3, [%7]
+    movh       %4, [%7+%8]
+    punpcklbw  %3, %5
+    punpcklbw  %4, %5
+    psraw      %1, %6
+    psraw      %2, %6
+    paddw      %3, %1
+    paddw      %4, %2
+    packuswb   %3, %5
+    packuswb   %4, %5
+    movh       [%7], %3
+    movh       [%7+%8], %4
+%endmacro
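For reference, the new MMX function follows the same structure as the scalar C version in libavcodec/vp8dsp.c: two 1-D passes with a transpose in between, fixed-point multiplies by 20091 and 35468, and a final (x + 4) >> 3 descale before adding to the predictor. The constant is stored as 17734 rather than 35468 because pmulhw is a signed 16x16 multiply and 35468 does not fit in a signed 16-bit word, so the inputs are doubled with psllw 1 instead. A minimal scalar sketch of the transform follows; the vp8_idct_add_ref name and the MUL_* helpers are illustrative rather than copied from the FFmpeg tree, and DCTELEM/av_clip_uint8() are assumed to come from the usual headers (dsputil.h, libavutil/common.h).

/* Illustrative scalar sketch of the 4x4 inverse transform + add that the
 * MMX code above implements; constants match pw_20091/pw_17734
 * (17734 * 2 == 35468). */
#define MUL_20091(a) ((((a) * 20091) >> 16) + (a)) /* ~sqrt(2)*cos(pi/8), Q16 */
#define MUL_35468(a) (((a) * 35468) >> 16)         /* ~sqrt(2)*sin(pi/8), Q16 */

static void vp8_idct_add_ref(uint8_t *dst, DCTELEM block[16], int stride)
{
    DCTELEM tmp[16];
    int i;

    /* first 1-D pass down the columns; results are stored transposed */
    for (i = 0; i < 4; i++) {
        int t0 = block[0*4+i] + block[2*4+i];
        int t1 = block[0*4+i] - block[2*4+i];
        int t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]);
        int t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]);

        tmp[i*4+0] = t0 + t3;
        tmp[i*4+1] = t1 + t2;
        tmp[i*4+2] = t1 - t2;
        tmp[i*4+3] = t0 - t3;
    }

    /* second 1-D pass, then round (+4), descale (>>3), add to the
     * prediction and clip to 8 bits; the MMX version folds the +4 into
     * row 0 (paddw m0, [pw_4]) before the second pass */
    for (i = 0; i < 4; i++) {
        int t0 = tmp[0*4+i] + tmp[2*4+i];
        int t1 = tmp[0*4+i] - tmp[2*4+i];
        int t2 = MUL_35468(tmp[1*4+i]) - MUL_20091(tmp[3*4+i]);
        int t3 = MUL_20091(tmp[1*4+i]) + MUL_35468(tmp[3*4+i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst += stride;
    }
}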