vp9/x86: 16x16 iadst_idct, idct_iadst and iadst_iadst (ssse3+avx).

Sample timings on ped1080p.webm (of the ssse3 functions):
iadst_idct:  4672 -> 1175 cycles
idct_iadst:  4736 -> 1263 cycles
iadst_iadst: 4924 -> 1438 cycles
Total decoding time changed from 6.565s to 6.413s.
This commit is contained in:
Ronald S. Bultje 2014-01-16 07:30:15 -05:00 committed by Clément Bœsch
parent a64333db5f
commit 8173d1ffc0
2 changed files with 293 additions and 13 deletions

View File

@ -157,13 +157,25 @@ filters_8tap_1d_fn3(avg)
#undef filters_8tap_1d_fn3
#undef filter_8tap_1d_fn
/* Prototypes of the inverse-transform "add" functions implemented in x86 asm,
 * declared here so the dsp init code below can install them into
 * VP9DSPContext.itxfm_add. (eob semantics are defined by the decoder core,
 * not visible in this file.) */
void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_8x8_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_16x16_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_32x32_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
/* itxfm_func(typea, typeb, size, opt): declare one asm inverse-transform
 * function, ff_vp9_<typea>_<typeb>_<size>x<size>_add_<opt>(). */
#define itxfm_func(typea, typeb, size, opt) \
void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
int16_t *block, int eob)
/* itxfm_funcs(size, opt): declare all four row/column transform-type
 * combinations (dct/adst x dct/adst) for one block size and cpu flavor. */
#define itxfm_funcs(size, opt) \
itxfm_func(idct, idct, size, opt); \
itxfm_func(iadst, idct, size, opt); \
itxfm_func(idct, iadst, size, opt); \
itxfm_func(iadst, iadst, size, opt)
/* Only 16x16 has all four variants implemented; the other sizes are
 * idct_idct-only so far. */
itxfm_func(idct, idct, 4, ssse3);
itxfm_func(idct, idct, 8, ssse3);
itxfm_func(idct, idct, 8, avx);
itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
#undef itxfm_func
#undef itxfm_funcs
void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
void ff_vp9_loop_filter_v_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
@ -226,7 +238,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
if (ARCH_X86_64) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
@ -240,6 +255,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
if (ARCH_X86_64) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =

View File

@ -25,9 +25,10 @@
SECTION_RODATA
pw_11585x2: times 8 dw 23170
pw_11585x2: times 8 dw 23170
pw_m11585x2: times 8 dw -23170
%macro VP9_IDCT_COEFFS 2
%macro VP9_IDCT_COEFFS 2-3 0
pw_%1x2: times 8 dw %1*2
pw_m%1x2: times 8 dw -%1*2
pw_%2x2: times 8 dw %2*2
@ -35,11 +36,15 @@ pw_m%2x2: times 8 dw -%2*2
pw_m%1_%2: times 4 dw -%1, %2
pw_%2_%1: times 4 dw %2, %1
pw_m%2_m%1: times 4 dw -%2, -%1
%if %3 == 1
pw_m%2_%1: times 4 dw -%2, %1
pw_%1_%2: times 4 dw %1, %2
%endif
%endmacro
VP9_IDCT_COEFFS 15137, 6270
VP9_IDCT_COEFFS 16069, 3196
VP9_IDCT_COEFFS 9102, 13623
VP9_IDCT_COEFFS 15137, 6270, 1
VP9_IDCT_COEFFS 16069, 3196, 1
VP9_IDCT_COEFFS 9102, 13623, 1
VP9_IDCT_COEFFS 16305, 1606
VP9_IDCT_COEFFS 10394, 12665
VP9_IDCT_COEFFS 14449, 7723
@ -57,6 +62,7 @@ pd_8192: times 4 dd 8192
pw_2048: times 8 dw 2048
pw_1024: times 8 dw 1024
pw_512: times 8 dw 512
pw_m1: times 8 dw -1
SECTION .text
@ -735,6 +741,262 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
VP9_IDCT_IDCT_16x16_ADD_XMM ssse3
VP9_IDCT_IDCT_16x16_ADD_XMM avx
;---------------------------------------------------------------------------------------------
; void vp9_iadst_iadst_16x16_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------
;---------------------------------------------------------------------------------------------
; VP9_UNPACK_MULSUB_2D_4X: full-precision (32-bit) butterfly multiply of two
; packed-word vectors src1 (m%1) and src2 (m%2) by constants %5/%6:
;   m%2 (lo dwords) / m%4 (hi dwords) = src1*%5 + src2*%6
;   m%1 (lo dwords) / m%3 (hi dwords) = src1*%6 - src2*%5
; Both inputs are overwritten; results stay as dwords so the caller can do a
; rounded >>14 later (see VP9_RND_SH_SUMSUB_BA). Requires the pw_m%5_%6 and
; pw_%6_%5 interleaved-constant tables emitted by VP9_IDCT_COEFFS.
;---------------------------------------------------------------------------------------------
%macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
punpckhwd m%4, m%2, m%1 ; m%4 = hi words of src2/src1 interleaved pairwise
punpcklwd m%2, m%1 ; m%2 = lo words of src2/src1 interleaved pairwise
pmaddwd m%3, m%4, [pw_m%5_%6] ; hi half: src1*%6 - src2*%5
pmaddwd m%4, [pw_%6_%5] ; hi half: src1*%5 + src2*%6
pmaddwd m%1, m%2, [pw_m%5_%6] ; lo half: src1*%6 - src2*%5
pmaddwd m%2, [pw_%6_%5] ; lo half: src1*%5 + src2*%6
%endmacro
;---------------------------------------------------------------------------------------------
; VP9_RND_SH_SUMSUB_BA: finish a dword-precision butterfly. (%1,%3) and
; (%2,%4) hold the lo/hi dword halves of two products a and b (as produced by
; VP9_UNPACK_MULSUB_2D_4X). Butterfly them (sum into %1/%3, difference into
; %2/%4), add the rounding bias %6 (pd_8192 = 1 << 13), arithmetic-shift right
; by 14 and pack back down to signed words:
;   m%1 = round_shift14(a + b), m%2 = round_shift14(b - a)
; %5 is a scratch register index used by SUMSUB_BA.
;---------------------------------------------------------------------------------------------
%macro VP9_RND_SH_SUMSUB_BA 6 ; dst1 [src1], dst2 [src2], src3, src4, tmp, round
SUMSUB_BA d, %1, %2, %5 ; lo halves: m%1 = sum, m%2 = difference
SUMSUB_BA d, %3, %4, %5 ; hi halves: m%3 = sum, m%4 = difference
paddd m%1, %6 ; +8192 rounding bias before the >>14
paddd m%2, %6
paddd m%3, %6
paddd m%4, %6
psrad m%1, 14
psrad m%2, 14
psrad m%3, 14
psrad m%4, 14
packssdw m%1, m%3 ; repack sums: lo|hi dwords -> 8 words
packssdw m%2, m%4 ; repack differences likewise
%endmacro
;---------------------------------------------------------------------------------------------
; VP9_IADST16_1D %1=src, %2=pass
; One 1-D 16-point inverse-ADST pass over an 8-column half of a 16x16 block.
; Source rows are read from %1 with a 32-byte stride (8 int16 coefficients out
; of each 16-wide row). Pass 1 transposes the result back into the stack
; scratch buffer; pass 2 rounds via pmulhrsw(pw_512) (i.e. the final >>6) and
; adds into dst with VP9_STORE_2X.
; Uses all 16 xmm registers plus stack scratch rows rN = [rsp + N*%%str], so
; this is x86-64 only. In the comments, [d] marks 32-bit dword intermediates
; and [w] packed 16-bit word values.
;---------------------------------------------------------------------------------------------
%macro VP9_IADST16_1D 2 ; src, pass
; scratch-row stride: 16 bytes in pass 1 (packed tight for the transpose
; output layout), 32 bytes in pass 2 (full 16x16 intermediate rows)
%assign %%str 16*%2
; stage 1, group 1: t0/t1/t8/t9 from in0/in15/in7/in8
mova m0, [%1+ 0*32] ; in0
mova m1, [%1+15*32] ; in15
mova m8, [%1+ 7*32] ; in7
mova m9, [%1+ 8*32] ; in8
VP9_UNPACK_MULSUB_2D_4X 1, 0, 2, 3, 16364, 804 ; m1/2=t1[d], m0/3=t0[d]
VP9_UNPACK_MULSUB_2D_4X 8, 9, 11, 10, 11003, 12140 ; m8/11=t9[d], m9/10=t8[d]
VP9_RND_SH_SUMSUB_BA 9, 0, 10, 3, 4, [pd_8192] ; m9=t0[w], m0=t8[w]
VP9_RND_SH_SUMSUB_BA 8, 1, 11, 2, 4, [pd_8192] ; m8=t1[w], m1=t9[w]
; stage 1, group 2: t2/t3/t10/t11 from in2/in13/in5/in10
mova m11, [%1+ 2*32] ; in2
mova m10, [%1+13*32] ; in13
mova m3, [%1+ 5*32] ; in5
mova m2, [%1+10*32] ; in10
VP9_UNPACK_MULSUB_2D_4X 10, 11, 6, 7, 15893, 3981 ; m10/6=t3[d], m11/7=t2[d]
VP9_UNPACK_MULSUB_2D_4X 3, 2, 4, 5, 8423, 14053 ; m3/4=t11[d], m2/5=t10[d]
VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; m2=t2[w], m11=t10[w]
VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w]
mova [rsp+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
; stage 1, group 3: t4/t5/t12/t13 from in4/in11/in3/in12
mova m4, [%1+ 4*32] ; in4
mova m5, [%1+11*32] ; in11
mova m12, [%1+ 3*32] ; in3
mova m13, [%1+12*32] ; in12
VP9_UNPACK_MULSUB_2D_4X 5, 4, 7, 6, 14811, 7005 ; m5/7=t5[d], m4/6=t4[d]
VP9_UNPACK_MULSUB_2D_4X 12, 13, 14, 15, 5520, 15426 ; m12/14=t13[d], m13/15=t12[d]
VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w]
VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w]
; spill t1..t4 to scratch to free registers for the last input group
mova [rsp+ 2*%%str], m8 ; t1:m8->r2
mova [rsp+ 3*%%str], m2 ; t2:m2->r3
mova [rsp+ 4*%%str], m3 ; t3:m3->r4
mova [rsp+ 5*%%str], m13 ; t4:m13->r5
; stage 1, group 4: t6/t7/t14/t15 from in6/in9/in1/in14
mova m2, [%1+ 6*32] ; in6
mova m3, [%1+ 9*32] ; in9
mova m8, [%1+ 1*32] ; in1
mova m9, [%1+14*32] ; in14
VP9_UNPACK_MULSUB_2D_4X 3, 2, 7, 6, 13160, 9760 ; m3/7=t7[d], m2/6=t6[d]
VP9_UNPACK_MULSUB_2D_4X 8, 9, 13, 14, 2404, 16207 ; m8/13=t15[d], m9/14=t14[d]
VP9_RND_SH_SUMSUB_BA 9, 2, 14, 6, 15, [pd_8192] ; m9=t6[w], m2=t14[w]
VP9_RND_SH_SUMSUB_BA 8, 3, 13, 7, 15, [pd_8192] ; m8=t7[w], m3=t15[w]
; r0=t0, r2=t1, r3=t2, r4=t3, r5=t4, m12=t5, m9=t6, m8=t7
; m0=t8, m1=t9, m11=t10, m10=t11, m4=t12, m5=t13, m2=t14, m3=t15
; handle t8-15 first
VP9_UNPACK_MULSUB_2D_4X 0, 1, 6, 7, 16069, 3196 ; m1/7=t8[d], m0/6=t9[d]
VP9_UNPACK_MULSUB_2D_4X 5, 4, 13, 14, 3196, 16069 ; m5/13=t12[d], m4/14=t13[d]
VP9_RND_SH_SUMSUB_BA 5, 1, 13, 7, 15, [pd_8192] ; m5=t8[w], m1=t12[w]
VP9_RND_SH_SUMSUB_BA 4, 0, 14, 6, 15, [pd_8192] ; m4=t9[w], m0=t13[w]
VP9_UNPACK_MULSUB_2D_4X 11, 10, 6, 7, 9102, 13623 ; m11/6=t11[d], m10/7=t10[d]
VP9_UNPACK_MULSUB_2D_4X 3, 2, 13, 14, 13623, 9102 ; m3/13=t14[d], m2/14=t15[d]
VP9_RND_SH_SUMSUB_BA 3, 10, 13, 7, 15, [pd_8192] ; m3=t10[w], m10=t14[w]
VP9_RND_SH_SUMSUB_BA 2, 11, 14, 6, 15, [pd_8192] ; m2=t11[w], m11=t15[w]
; m5=t8, m4=t9, m3=t10, m2=t11, m1=t12, m0=t13, m10=t14, m11=t15
VP9_UNPACK_MULSUB_2D_4X 1, 0, 6, 7, 15137, 6270 ; m1/6=t13[d], m0/7=t12[d]
VP9_UNPACK_MULSUB_2D_4X 11, 10, 13, 14, 6270, 15137 ; m11/13=t14[d], m10/14=t15[d]
VP9_RND_SH_SUMSUB_BA 11, 0, 13, 7, 15, [pd_8192] ; m11=out2[w], m0=t14[w]
VP9_RND_SH_SUMSUB_BA 10, 1, 14, 6, 15, [pd_8192]
psignw m10, [pw_m1] ; m10=out13[w], m1=t15[w]
SUMSUB_BA w, 3, 5, 15
psignw m3, [pw_m1] ; m3=out1[w], m5=t10[w]
SUMSUB_BA w, 2, 4, 15 ; m2=out14[w], m4=t11[w]
; out6/out9/out5/out10 are +-(t10 +- t11)/sqrt(2) (doubled constants fold the
; 1/sqrt(2) into pmulhrsw's built-in rounded >>15)
SUMSUB_BA w, 5, 4, 15
pmulhrsw m5, [pw_11585x2] ; m5=out6[w]
pmulhrsw m4, [pw_11585x2] ; m4=out9[w]
SUMSUB_BA w, 1, 0, 15
pmulhrsw m1, [pw_m11585x2] ; m1=out5[w]
pmulhrsw m0, [pw_11585x2] ; m0=out10[w]
; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
; reload the spilled t0-t4 and spill the finished outputs in their place
mova m6, [rsp+ 0*%%str]
mova m7, [rsp+ 2*%%str]
mova m13, [rsp+ 3*%%str]
mova m14, [rsp+ 4*%%str]
mova m15, [rsp+ 5*%%str]
mova [rsp+ 8*%%str], m5
mova [rsp+ 9*%%str], m4
mova [rsp+10*%%str], m0
mova [rsp+11*%%str], m10
mova [rsp+12*%%str], m2
; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
; now handle t0-7
SUMSUB_BA w, 15, 6, 0 ; m15=t0[w], m6=t4[w]
SUMSUB_BA w, 12, 7, 0 ; m12=t1[w], m7=t5[w]
SUMSUB_BA w, 9, 13, 0 ; m9=t2[w], m13=t6[w]
SUMSUB_BA w, 8, 14, 0 ; m8=t3[w], m14=t7[w]
VP9_UNPACK_MULSUB_2D_4X 6, 7, 0, 2, 15137, 6270 ; m6/0=t5[d], m7/2=t4[d]
VP9_UNPACK_MULSUB_2D_4X 14, 13, 4, 5, 6270, 15137 ; m14/4=t6[d], m13/5=t7[d]
VP9_RND_SH_SUMSUB_BA 14, 7, 4, 2, 10, [pd_8192]
psignw m14, [pw_m1] ; m14=out3[w], m7=t6[w]
VP9_RND_SH_SUMSUB_BA 13, 6, 5, 0, 10, [pd_8192] ; m13=out12[w], m6=t7[w]
SUMSUB_BA w, 9, 15, 10 ; m9=out0[w], m15=t2[w]
SUMSUB_BA w, 8, 12, 10
psignw m8, [pw_m1] ; m8=out15[w], m12=t3[w]
SUMSUB_BA w, 12, 15, 10
pmulhrsw m12, [pw_m11585x2] ; m12=out7[w]
pmulhrsw m15, [pw_11585x2] ; m15=out8[w]
SUMSUB_BA w, 7, 6, 10
pmulhrsw m7, [pw_11585x2] ; m7=out4[w]
pmulhrsw m6, [pw_11585x2] ; m6=out11[w]
; m9=out0, m14=out3, m7=out4, m12=out7, m15=out8, m6=out11, m13=out12, m8=out15
; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
%if %2 == 1
; pass 1: transpose the two 8x8 halves of the 16x8 output, interleaving the
; row halves (even/odd 16-byte slots) so pass 2 can read 16-wide rows
mova m0, [rsp+ 8*%%str]
TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2
mova [rsp+ 0*16], m9
mova [rsp+ 2*16], m3
mova [rsp+ 4*16], m11
mova [rsp+ 6*16], m14
mova m9, [rsp+ 9*%%str]
mova m3, [rsp+10*%%str]
mova m11, [rsp+11*%%str]
mova m14, [rsp+12*%%str]
mova [rsp+ 8*16], m7
mova [rsp+10*16], m1
mova [rsp+12*16], m0
mova [rsp+14*16], m12
TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2
mova [rsp+ 1*16], m15
mova [rsp+ 3*16], m9
mova [rsp+ 5*16], m3
mova [rsp+ 7*16], m6
mova [rsp+ 9*16], m13
mova [rsp+11*16], m11
mova [rsp+13*16], m14
mova [rsp+15*16], m8
%else
; pass 2: round each output row (pmulhrsw by pw_512 == rounded >>6) and add
; the result to dst, two rows per VP9_STORE_2X; m0 stays zero throughout
mova m5, [rsp+ 8*%%str]
pxor m0, m0
pmulhrsw m9, [pw_512]
pmulhrsw m3, [pw_512]
VP9_STORE_2X 9, 3, 2, 4, 0
lea dstq, [dstq+strideq*2]
pmulhrsw m11, [pw_512]
pmulhrsw m14, [pw_512]
VP9_STORE_2X 11, 14, 2, 4, 0
lea dstq, [dstq+strideq*2]
pmulhrsw m7, [pw_512]
pmulhrsw m1, [pw_512]
VP9_STORE_2X 7, 1, 2, 4, 0
lea dstq, [dstq+strideq*2]
pmulhrsw m5, [pw_512]
pmulhrsw m12, [pw_512]
VP9_STORE_2X 5, 12, 2, 4, 0
lea dstq, [dstq+strideq*2]
mova m9, [rsp+ 9*%%str]
mova m3, [rsp+10*%%str]
mova m11, [rsp+11*%%str]
mova m14, [rsp+12*%%str]
pmulhrsw m15, [pw_512]
pmulhrsw m9, [pw_512]
VP9_STORE_2X 15, 9, 2, 4, 0
lea dstq, [dstq+strideq*2]
pmulhrsw m3, [pw_512]
pmulhrsw m6, [pw_512]
VP9_STORE_2X 3, 6, 2, 4, 0
lea dstq, [dstq+strideq*2]
pmulhrsw m13, [pw_512]
pmulhrsw m11, [pw_512]
VP9_STORE_2X 13, 11, 2, 4, 0
lea dstq, [dstq+strideq*2]
pmulhrsw m14, [pw_512]
pmulhrsw m8, [pw_512]
VP9_STORE_2X 14, 8, 2, 4, 0
%endif
%endmacro
;---------------------------------------------------------------------------------------------
; IADST16_FN typea, TYPEA, typeb, TYPEB, opt
; Instantiate ff_vp9_<typea>_<typeb>_16x16_add_<opt>(dst, stride, block, eob):
; column pass VP9_<TYPEA>_1D over the coefficients (results to a 512-byte
; stack scratch buffer), then row pass VP9_<TYPEB>_1D adding into dst.
; Each pass runs twice, once per 8-column half. x86-64 only (16 xmm regs).
;---------------------------------------------------------------------------------------------
%macro IADST16_FN 5
INIT_XMM %5
cglobal vp9_%1_%3_16x16_add, 3, 5, 16, 512, dst, stride, block, eob
; potential eob checks go here
DEFINE_ARGS dst, stride, block, cnt, dst_bak
mov cntd, 2 ; two 8-column halves per pass
.loop1_full:
VP9_%2_1D blockq, 1
add blockq, 16 ; advance to the right 8-column half of the input
add rsp, 256 ; the 1D macros address scratch via rsp: point at 2nd half
dec cntd
jg .loop1_full
sub blockq, 32 ; rewind both 16-byte steps
sub rsp, 512 ; rewind scratch pointer to the full intermediate buffer
mov cntd, 2
mov dst_bakq, dstq
.loop2_full:
VP9_%4_1D rsp, 2
lea dstq, [dst_bakq+8] ; 2nd iteration: right 8 pixels of each dst row
add rsp, 16 ; odd-numbered 16-byte slots = right-half rows
dec cntd
jg .loop2_full
sub rsp, 32
; at the end of the loop, m0 should still be zero
; use that to zero out block coefficients
ZERO_BLOCK blockq, 32, 16, m0
RET
%endmacro
IADST16_FN idct, IDCT16, iadst, IADST16, ssse3
IADST16_FN idct, IDCT16, iadst, IADST16, avx
IADST16_FN iadst, IADST16, idct, IDCT16, ssse3
IADST16_FN iadst, IADST16, idct, IDCT16, avx
IADST16_FN iadst, IADST16, iadst, IADST16, ssse3
IADST16_FN iadst, IADST16, iadst, IADST16, avx
;---------------------------------------------------------------------------------------------
; void vp9_idct_idct_32x32_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;---------------------------------------------------------------------------------------------