From 8476ca3b4ee5455eceb6ea6e58ea8200d63dff9d Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Sat, 3 Mar 2012 08:04:40 -0800
Subject: [PATCH] vp8: convert idct x86 assembly to use named arguments.

---
 libavcodec/x86/vp8dsp.asm | 200 ++++++++++++++++++++------------------
 1 file changed, 103 insertions(+), 97 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 2f45f5a478..74456c687e 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -906,10 +906,10 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
 ;-----------------------------------------------------------------------------
 
 %macro ADD_DC 4
-    %4 m2, [r0+%3]
-    %4 m3, [r0+r2+%3]
-    %4 m4, [r1+%3]
-    %4 m5, [r1+r2+%3]
+    %4 m2, [dst1q+%3]
+    %4 m3, [dst1q+strideq+%3]
+    %4 m4, [dst2q+%3]
+    %4 m5, [dst2q+strideq+%3]
     paddusb m2, %1
     paddusb m3, %1
     paddusb m4, %1
@@ -918,22 +918,22 @@ cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
     psubusb m3, %2
     psubusb m4, %2
     psubusb m5, %2
-    %4 [r0+%3], m2
-    %4 [r0+r2+%3], m3
-    %4 [r1+%3], m4
-    %4 [r1+r2+%3], m5
+    %4 [dst1q+%3], m2
+    %4 [dst1q+strideq+%3], m3
+    %4 [dst2q+%3], m4
+    %4 [dst2q+strideq+%3], m5
 %endmacro
 
 INIT_MMX mmx
-cglobal vp8_idct_dc_add, 3, 3
+cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
     ; load data
-    movd m0, [r1]
+    movd m0, [blockq]
 
     ; calculate DC
     paddw m0, [pw_4]
     pxor m1, m1
     psraw m0, 3
-    movd [r1], m1
+    movd [blockq], m1
     psubw m1, m0
     packuswb m0, m0
     packuswb m1, m1
@@ -943,24 +943,26 @@ cglobal vp8_idct_dc_add, 3, 3
     punpcklwd m1, m1
 
     ; add DC
-    lea r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea dst2q, [dst1q+strideq*2]
     ADD_DC m0, m1, 0, movh
     RET
 
 INIT_XMM sse4
-cglobal vp8_idct_dc_add, 3, 3, 6
+cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
     ; load data
-    movd m0, [r1]
+    movd m0, [blockq]
     pxor m1, m1
 
     ; calculate DC
     paddw m0, [pw_4]
-    movd [r1], m1
-    lea r1, [r0+r2*2]
-    movd m2, [r0]
-    movd m3, [r0+r2]
-    movd m4, [r1]
-    movd m5, [r1+r2]
+    movd [blockq], m1
+    DEFINE_ARGS dst1, dst2, stride
+    lea dst2q, [dst1q+strideq*2]
+    movd m2, [dst1q]
+    movd m3, [dst1q+strideq]
+    movd m4, [dst2q]
+    movd m5, [dst2q+strideq]
     psraw m0, 3
     pshuflw m0, m0, 0
     punpcklqdq m0, m0
@@ -971,10 +973,10 @@ cglobal vp8_idct_dc_add, 3, 3, 6
     paddw m2, m0
     paddw m4, m0
     packuswb m2, m4
-    movd [r0], m2
-    pextrd [r0+r2], m2, 1
-    pextrd [r1], m2, 2
-    pextrd [r1+r2], m2, 3
+    movd [dst1q], m2
+    pextrd [dst1q+strideq], m2, 1
+    pextrd [dst2q], m2, 2
+    pextrd [dst2q+strideq], m2, 3
     RET
 
 ;-----------------------------------------------------------------------------
@@ -983,21 +985,21 @@
 
 %if ARCH_X86_32
 INIT_MMX mmx
-cglobal vp8_idct_dc_add4y, 3, 3
+cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
     ; load data
-    movd m0, [r1+32*0] ; A
-    movd m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd m0, [blockq+32*0] ; A
+    movd m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1 ; A B C D
     pxor m6, m6
 
     ; calculate DC
     paddw m0, [pw_4]
-    movd [r1+32*0], m6
-    movd [r1+32*1], m6
-    movd [r1+32*2], m6
-    movd [r1+32*3], m6
+    movd [blockq+32*0], m6
+    movd [blockq+32*1], m6
+    movd [blockq+32*2], m6
+    movd [blockq+32*3], m6
     psraw m0, 3
     psubw m6, m0
     packuswb m0, m0
@@ -1012,28 +1014,29 @@ cglobal vp8_idct_dc_add4y, 3, 3
     punpckhbw m7, m7 ; CCCCDDDD
 
     ; add DC
-    lea r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea dst2q, [dst1q+strideq*2]
     ADD_DC m0, m6, 0, mova
     ADD_DC m1, m7, 8, mova
     RET
 %endif
 
 INIT_XMM sse2
-cglobal vp8_idct_dc_add4y, 3, 3, 6
+cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
     ; load data
-    movd m0, [r1+32*0] ; A
-    movd m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd m0, [blockq+32*0] ; A
+    movd m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1 ; A B C D
     pxor m1, m1
 
     ; calculate DC
     paddw m0, [pw_4]
-    movd [r1+32*0], m1
-    movd [r1+32*1], m1
-    movd [r1+32*2], m1
-    movd [r1+32*3], m1
+    movd [blockq+32*0], m1
+    movd [blockq+32*1], m1
+    movd [blockq+32*2], m1
+    movd [blockq+32*3], m1
     psraw m0, 3
     psubw m1, m0
     packuswb m0, m0
@@ -1044,7 +1047,8 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6
     punpcklbw m1, m1
 
     ; add DC
-    lea r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea dst2q, [dst1q+strideq*2]
     ADD_DC m0, m1, 0, mova
     RET
 
@@ -1053,21 +1057,21 @@ cglobal vp8_idct_dc_add4y, 3, 3, 6
 ;-----------------------------------------------------------------------------
 
 INIT_MMX mmx
-cglobal vp8_idct_dc_add4uv, 3, 3
+cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
     ; load data
-    movd m0, [r1+32*0] ; A
-    movd m1, [r1+32*2] ; C
-    punpcklwd m0, [r1+32*1] ; A B
-    punpcklwd m1, [r1+32*3] ; C D
+    movd m0, [blockq+32*0] ; A
+    movd m1, [blockq+32*2] ; C
+    punpcklwd m0, [blockq+32*1] ; A B
+    punpcklwd m1, [blockq+32*3] ; C D
     punpckldq m0, m1 ; A B C D
     pxor m6, m6
 
     ; calculate DC
     paddw m0, [pw_4]
-    movd [r1+32*0], m6
-    movd [r1+32*1], m6
-    movd [r1+32*2], m6
-    movd [r1+32*3], m6
+    movd [blockq+32*0], m6
+    movd [blockq+32*1], m6
+    movd [blockq+32*2], m6
+    movd [blockq+32*3], m6
     psraw m0, 3
     psubw m6, m0
     packuswb m0, m0
@@ -1082,10 +1086,11 @@ cglobal vp8_idct_dc_add4uv, 3, 3
     punpckhbw m7, m7 ; CCCCDDDD
 
     ; add DC
-    lea r1, [r0+r2*2]
+    DEFINE_ARGS dst1, dst2, stride
+    lea dst2q, [dst1q+strideq*2]
     ADD_DC m0, m6, 0, mova
-    lea r0, [r0+r2*4]
-    lea r1, [r1+r2*4]
+    lea dst1q, [dst1q+strideq*4]
+    lea dst2q, [dst2q+strideq*4]
    ADD_DC m1, m7, 0, mova
     RET
 
@@ -1125,24 +1130,24 @@ cglobal vp8_idct_dc_add4uv, 3, 3
 %endmacro
 
 %macro VP8_IDCT_ADD 0
-cglobal vp8_idct_add, 3, 3
+cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
     ; load block data
-    movq m0, [r1+ 0]
-    movq m1, [r1+ 8]
-    movq m2, [r1+16]
-    movq m3, [r1+24]
+    movq m0, [blockq+ 0]
+    movq m1, [blockq+ 8]
+    movq m2, [blockq+16]
+    movq m3, [blockq+24]
     movq m6, [pw_20091]
     movq m7, [pw_17734]
 %if cpuflag(sse)
     xorps xmm0, xmm0
-    movaps [r1+ 0], xmm0
-    movaps [r1+16], xmm0
+    movaps [blockq+ 0], xmm0
+    movaps [blockq+16], xmm0
 %else
     pxor m4, m4
-    movq [r1+ 0], m4
-    movq [r1+ 8], m4
-    movq [r1+16], m4
-    movq [r1+24], m4
+    movq [blockq+ 0], m4
+    movq [blockq+ 8], m4
+    movq [blockq+16], m4
+    movq [blockq+24], m4
 %endif
 
     ; actual IDCT
@@ -1154,9 +1159,10 @@ cglobal vp8_idct_add, 3, 3
 
     ; store
     pxor m4, m4
-    lea r1, [r0+2*r2]
-    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
-    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
+    DEFINE_ARGS dst1, dst2, stride
+    lea dst2q, [dst1q+2*strideq]
+    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
+    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
     RET
 %endmacro
 
@@ -1173,24 +1179,24 @@ VP8_IDCT_ADD
 ;-----------------------------------------------------------------------------
 
 %macro SCATTER_WHT 3
-    movd r1d, m%1
-    movd r2d, m%2
-    mov [r0+2*16*(0+%3)], r1w
-    mov [r0+2*16*(1+%3)], r2w
-    shr r1d, 16
-    shr r2d, 16
+    movd dc1d, m%1
+    movd dc2d, m%2
+    mov [blockq+2*16*(0+%3)], dc1w
+    mov [blockq+2*16*(1+%3)], dc2w
+    shr dc1d, 16
+    shr dc2d, 16
     psrlq m%1, 32
     psrlq m%2, 32
-    mov [r0+2*16*(4+%3)], r1w
-    mov [r0+2*16*(5+%3)], r2w
-    movd r1d, m%1
-    movd r2d, m%2
-    mov [r0+2*16*(8+%3)], r1w
-    mov [r0+2*16*(9+%3)], r2w
-    shr r1d, 16
-    shr r2d, 16
-    mov [r0+2*16*(12+%3)], r1w
-    mov [r0+2*16*(13+%3)], r2w
+    mov [blockq+2*16*(4+%3)], dc1w
+    mov [blockq+2*16*(5+%3)], dc2w
+    movd dc1d, m%1
+    movd dc2d, m%2
+    mov [blockq+2*16*(8+%3)], dc1w
+    mov [blockq+2*16*(9+%3)], dc2w
+    shr dc1d, 16
+    shr dc2d, 16
+    mov [blockq+2*16*(12+%3)], dc1w
+    mov [blockq+2*16*(13+%3)], dc2w
 %endmacro
 
 %macro HADAMARD4_1D 4
     SUMSUB_BADC w, %2, %1, %4, %3
     SUMSUB_BADC w, %4, %2, %3, %1
     SWAP %1, %4, %3
@@ -1200,21 +1206,21 @@ VP8_IDCT_ADD
 %endmacro
 
 %macro VP8_DC_WHT 0
-cglobal vp8_luma_dc_wht, 2, 3
-    movq m0, [r1]
-    movq m1, [r1+8]
-    movq m2, [r1+16]
-    movq m3, [r1+24]
+cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
+    movq m0, [dc1q]
+    movq m1, [dc1q+8]
+    movq m2, [dc1q+16]
+    movq m3, [dc1q+24]
 %if cpuflag(sse)
     xorps xmm0, xmm0
-    movaps [r1+ 0], xmm0
-    movaps [r1+16], xmm0
+    movaps [dc1q+ 0], xmm0
+    movaps [dc1q+16], xmm0
 %else
     pxor m4, m4
-    movq [r1+ 0], m4
-    movq [r1+ 8], m4
-    movq [r1+16], m4
-    movq [r1+24], m4
+    movq [dc1q+ 0], m4
+    movq [dc1q+ 8], m4
+    movq [dc1q+16], m4
+    movq [dc1q+24], m4
 %endif
     HADAMARD4_1D 0, 1, 2, 3
     TRANSPOSE4x4W 0, 1, 2, 3, 4