From 4b66274a86ddcabf15c51e45ebd6f144387130a6 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Fri, 11 Sep 2015 12:00:50 -0400
Subject: [PATCH] vp9: save one (PSIGNW) instruction in iadst16_1d sse2/ssse3.

---
 libavcodec/x86/vp9itxfm.asm | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 54462a6220..03a43d199c 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -29,6 +29,7 @@ pw_11585x2: times 8 dw 23170
 pw_m11585x2: times 8 dw -23170
 pw_m11585_11585: times 4 dw -11585, 11585
 pw_11585_11585: times 8 dw 11585
+pw_m11585_m11585: times 8 dw -11585
 
 %macro VP9_IDCT_COEFFS 2-3 0
 pw_%1x2: times 8 dw %1*2
@@ -1716,13 +1717,13 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     SUMSUB_BA w, 7, 6, 4
     pmulhrsw m7, [pw_m11585x2] ; m8=out7[w]
     pmulhrsw m6, [pw_11585x2]  ; m1=out8[w]
+    SWAP 6, 7
     SUMSUB_BA w, 3, 2, 4
     pmulhrsw m3, [pw_11585x2]  ; m3=out4[w]
     pmulhrsw m2, [pw_11585x2]  ; m2=out11[w]
 %else
     SCRATCH 5, 8, tmpq+10*%%str
-    PSIGNW m7, [pw_m1]
-    VP9_UNPACK_MULSUB_2W_4X 7, 6, 11585, 11585, [pd_8192], 5, 4
+    VP9_UNPACK_MULSUB_2W_4X 6, 7, 11585, m11585, [pd_8192], 5, 4
     VP9_UNPACK_MULSUB_2W_4X 2, 3, 11585, 11585, [pd_8192], 5, 4
     UNSCRATCH 5, 8, tmpq+10*%%str
 %endif
@@ -1733,7 +1734,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
 %if %2 == 1
 %if ARCH_X86_64
     mova m13, [tmpq+ 6*%%str]
-    TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 7, 10
+    TRANSPOSE8x8W 1, 11, 14, 0, 3, 15, 13, 6, 10
     mova [tmpq+ 0*16], m1
     mova [tmpq+ 2*16], m11
     mova [tmpq+ 4*16], m14
@@ -1745,10 +1746,10 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     mova [tmpq+ 8*16], m3
     mova [tmpq+10*16], m15
     mova [tmpq+12*16], m13
-    mova [tmpq+14*16], m7
+    mova [tmpq+14*16], m6
 
-    TRANSPOSE8x8W 6, 1, 11, 2, 9, 14, 0, 5, 10
-    mova [tmpq+ 1*16], m6
+    TRANSPOSE8x8W 7, 1, 11, 2, 9, 14, 0, 5, 10
+    mova [tmpq+ 1*16], m7
     mova [tmpq+ 3*16], m1
     mova [tmpq+ 5*16], m11
     mova [tmpq+ 7*16], m2
@@ -1759,20 +1760,20 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
 %else
     mova [tmpq+12*%%str], m2
     mova [tmpq+ 1*%%str], m5
-    mova [tmpq+15*%%str], m6
+    mova [tmpq+15*%%str], m7
     mova m2, [tmpq+ 9*%%str]
     mova m5, [tmpq+ 5*%%str]
-    mova m6, [tmpq+ 8*%%str]
-    TRANSPOSE8x8W 1, 2, 5, 0, 3, 6, 4, 7, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
+    mova m7, [tmpq+ 8*%%str]
+    TRANSPOSE8x8W 1, 2, 5, 0, 3, 7, 4, 6, [tmpq+ 6*%%str], [tmpq+ 8*%%str], 1
     mova [tmpq+ 0*16], m1
     mova [tmpq+ 2*16], m2
     mova [tmpq+ 4*16], m5
     mova [tmpq+ 6*16], m0
-    mova [tmpq+10*16], m6
+    mova [tmpq+10*16], m7
     mova m3, [tmpq+12*%%str]
     mova [tmpq+12*16], m4
     mova m4, [tmpq+14*%%str]
-    mova [tmpq+14*16], m7
+    mova [tmpq+14*16], m6
 
     mova m0, [tmpq+15*%%str]
     mova m1, [tmpq+ 3*%%str]
@@ -1805,7 +1806,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     lea dstq, [dstq+strideq*2]
     VP9_IDCT8_WRITEx2 3, 15, 10, 8, 4, ROUND_REG, 6
     lea dstq, [dstq+strideq*2]
-    VP9_IDCT8_WRITEx2 12, 7, 10, 8, 4, ROUND_REG, 6
+    VP9_IDCT8_WRITEx2 12, 6, 10, 8, 4, ROUND_REG, 6
     lea dstq, [dstq+strideq*2]
 
     mova m1, [tmpq+ 3*%%str]
@@ -1813,7 +1814,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     mova m14, [tmpq+11*%%str]
     mova m0, [tmpq+13*%%str]
 
-    VP9_IDCT8_WRITEx2 6, 1, 10, 8, 4, ROUND_REG, 6
+    VP9_IDCT8_WRITEx2 7, 1, 10, 8, 4, ROUND_REG, 6
     lea dstq, [dstq+strideq*2]
     VP9_IDCT8_WRITEx2 11, 2, 10, 8, 4, ROUND_REG, 6
     lea dstq, [dstq+strideq*2]
@@ -1823,9 +1824,9 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
 %else
     mova [tmpq+ 0*%%str], m2
     mova [tmpq+ 1*%%str], m5
-    mova [tmpq+ 2*%%str], m6
+    mova [tmpq+ 2*%%str], m7
     mova m2, [tmpq+ 9*%%str]
-    VP9_IDCT8_WRITEx2 1, 2, 5, 6, 4, ROUND_REG, 6
+    VP9_IDCT8_WRITEx2 1, 2, 5, 7, 4, ROUND_REG, 6
    lea dstq, [dstq+strideq*2]
     mova m5, [tmpq+ 5*%%str]
     VP9_IDCT8_WRITEx2 5, 0, 1, 2, 4, ROUND_REG, 6
@@ -1834,7 +1835,7 @@ VP9_IDCT_IDCT_16x16_ADD_XMM avx
     VP9_IDCT8_WRITEx2 3, 5, 1, 2, 4, ROUND_REG, 6
     lea dstq, [dstq+strideq*2]
     mova m5, [tmpq+ 6*%%str]
-    VP9_IDCT8_WRITEx2 5, 7, 1, 2, 4, ROUND_REG, 6
+    VP9_IDCT8_WRITEx2 5, 6, 1, 2, 4, ROUND_REG, 6
     lea dstq, [dstq+strideq*2]
     mova m0, [tmpq+ 2*%%str]
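
Editorial addendum (not part of the committed patch): the saved instruction
comes from the identity (-a)*c == a*(-c). Instead of flipping the sign of the
data register with PSIGNW m7, [pw_m1] and then rotating with two positive
11585 coefficients, the sign is folded into the constant itself -- hence the
new pw_m11585_m11585 row and the m11585 coefficient argument -- so the
sse2/ssse3 path drops one instruction. The surrounding m6/m7 renames (and the
SWAP 6, 7 in the pmulhrsw path) appear to only track the swapped output order
of the macro; the arithmetic is unchanged.

Below is a minimal scalar sketch of why the fold is exact under the same
pd_8192 (1 << 13) rounding term and >> 14 shift. mul_round() is an
illustrative stand-in written for this note, not the real internals of
VP9_UNPACK_MULSUB_2W_4X, which rotates two inputs at once.

#include <stdint.h>
#include <stdio.h>

/* one lane of a 14-bit fixed-point multiply with rounding;
 * models (a * c + 8192) >> 14 as done with [pd_8192] */
static int16_t mul_round(int16_t a, int c)
{
    return (int16_t)((a * c + 8192) >> 14);
}

int main(void)
{
    /* INT16_MIN is skipped: negating it wraps in a 16-bit register
     * (for PSIGNW as well), and the transform's intermediates are
     * not expected to reach that value */
    for (int a = -32767; a <= 32767; a++) {
        int16_t before = mul_round((int16_t)-a, 11585); /* PSIGNW, then *11585 */
        int16_t after  = mul_round((int16_t)a, -11585); /* *(-11585) directly  */
        if (before != after)
            return 1; /* never taken: (-a)*c == a*(-c) exactly */
    }
    puts("exact for all inputs");
    return 0;
}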