vp9/x86: idct_32x32_add_ssse3 sub-8x8-idct.

Runtime of the full 32x32 idct goes from 2446 to 2441 cycles (intra) or
from 1425 to 1306 cycles (inter). Overall runtime is not significantly
affected.
This commit is contained in:
Ronald S. Bultje 2014-01-05 11:18:47 -05:00
parent 37b001d14d
commit 04a187fb2a
1 changed file with 107 additions and 2 deletions

View File

@@ -74,13 +74,22 @@ SECTION .text
psrad m%2, 14 psrad m%2, 14
%endmacro %endmacro
%macro VP9_UNPACK_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1, tmp2 %macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
%if %0 == 7
punpckhwd m%6, m%2, m%1 punpckhwd m%6, m%2, m%1
VP9_MULSUB_2W_2X %7, %6, %6, %5, [pw_m%3_%4], [pw_%4_%3] VP9_MULSUB_2W_2X %7, %6, %6, %5, [pw_m%3_%4], [pw_%4_%3]
punpcklwd m%2, m%1 punpcklwd m%2, m%1
VP9_MULSUB_2W_2X %1, %2, %2, %5, [pw_m%3_%4], [pw_%4_%3] VP9_MULSUB_2W_2X %1, %2, %2, %5, [pw_m%3_%4], [pw_%4_%3]
packssdw m%1, m%7 packssdw m%1, m%7
packssdw m%2, m%6 packssdw m%2, m%6
%else
punpckhwd m%8, m%4, m%3
VP9_MULSUB_2W_2X %9, %8, %8, %7, [pw_m%5_%6], [pw_%6_%5]
punpcklwd m%2, m%4, m%3
VP9_MULSUB_2W_2X %1, %2, %2, %7, [pw_m%5_%6], [pw_%6_%5]
packssdw m%1, m%9
packssdw m%2, m%8
%endif
%endmacro %endmacro
%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst %macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
@@ -381,6 +390,32 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
; SUMSUB_BA w, 6, 9, 15 ; t6, t9 ; SUMSUB_BA w, 6, 9, 15 ; t6, t9
; SUMSUB_BA w, 7, 8, 15 ; t7, t8 ; SUMSUB_BA w, 7, 8, 15 ; t7, t8
%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch %macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
%if %2 <= 4
mova m3, [%1+ 1*%3] ; IN(1)
mova m12, [%1+ 2*%3] ; IN(2)
mova m0, [%1+ 3*%3] ; IN(3)
pmulhrsw m15, m12, [pw_16069x2] ; t6-7
pmulhrsw m12, [pw_3196x2] ; t4-5
pmulhrsw m4, m3, [pw_16305x2] ; t14-15
pmulhrsw m3, [pw_1606x2] ; t8-9
pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
pmulhrsw m0, [pw_15679x2] ; t12-13
; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
paddw m14, m15, m12
psubw m13, m15, m12
pmulhrsw m13, [pw_11585x2] ; t5
pmulhrsw m14, [pw_11585x2] ; t6
VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
%else
mova m5, [%1+ 1*%3] ; IN(1) mova m5, [%1+ 1*%3] ; IN(1)
mova m14, [%1+ 2*%3] ; IN(2) mova m14, [%1+ 2*%3] ; IN(2)
mova m6, [%1+ 3*%3] ; IN(3) mova m6, [%1+ 3*%3] ; IN(3)
@@ -442,6 +477,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
pmulhrsw m14, [pw_11585x2] ; t6 pmulhrsw m14, [pw_11585x2] ; t6
VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14 VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13 VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
%endif
; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7 ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15 ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
@@ -468,6 +504,17 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15 ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
; from load/start ; from load/start
%if %2 <= 4
mova m11, [%1+ 0*%3] ; IN(0)
pmulhrsw m11, [pw_11585x2] ; t0-t3
psubw m8, m11, m15
paddw m15, m11
psubw m9, m11, m14
paddw m14, m11
psubw m10, m11, m13
paddw m13, m11
%else
mova m10, [%1+ 0*%3] ; IN(0) mova m10, [%1+ 0*%3] ; IN(0)
%if %2 <= 8 %if %2 <= 8
pmulhrsw m10, [pw_11585x2] ; t0 and t1 pmulhrsw m10, [pw_11585x2] ; t0 and t1
@@ -490,6 +537,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
SUMSUB_BA w, 15, 8, 7 ; t0, t7 SUMSUB_BA w, 15, 8, 7 ; t0, t7
SUMSUB_BA w, 14, 9, 7 ; t1, t6 SUMSUB_BA w, 14, 9, 7 ; t1, t6
SUMSUB_BA w, 13, 10, 7 ; t2, t5 SUMSUB_BA w, 13, 10, 7 ; t2, t5
%endif
SUMSUB_BA w, 12, 11, 7 ; t3, t4 SUMSUB_BA w, 12, 11, 7 ; t3, t4
SUMSUB_BA w, 0, 15, 7 ; t0, t15 SUMSUB_BA w, 0, 15, 7 ; t0, t15
@@ -716,6 +764,37 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
mova [rsp+26*%%str], m14 ; t14 mova [rsp+26*%%str], m14 ; t14
; then, secondly, do t16-31 ; then, secondly, do t16-31
%if %3 <= 8
mova m4, [%1+ 1*64]
mova m3, [%1+ 3*64]
mova m0, [%1+ 5*64]
mova m7, [%1+ 7*64]
pmulhrsw m11, m4, [pw_16364x2] ;t31
pmulhrsw m4, [pw_804x2] ;t16
pmulhrsw m8, m7, [pw_m5520x2] ;t19
pmulhrsw m7, [pw_15426x2] ;t28
pmulhrsw m15, m0, [pw_15893x2] ;t27
pmulhrsw m0, [pw_3981x2] ;t20
pmulhrsw m12, m3, [pw_m2404x2] ;t23
pmulhrsw m3, [pw_16207x2] ;t24
; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23,
; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31
VP9_UNPACK_MULSUB_2W_4X 5, 10, 11, 4, 16069, 3196, [pd_8192], 6, 9 ; t17, t30
VP9_UNPACK_MULSUB_2W_4X 9, 6, 7, 8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29
; from 1 stage forward
SUMSUB_BA w, 8, 4, 1
; temporary storage
mova [rsp+17*%%str], m8 ; t16
mova [rsp+21*%%str], m4 ; t19
VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
%else
mova m10, [%1+ 1*64] mova m10, [%1+ 1*64]
mova m13, [%1+ 3*64] mova m13, [%1+ 3*64]
mova m14, [%1+ 5*64] mova m14, [%1+ 5*64]
@@ -793,6 +872,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29 VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26 VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25 VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
%endif
; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23, ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31 ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
@@ -1029,8 +1109,10 @@ INIT_XMM ssse3
cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
cmp eobd, 135 cmp eobd, 135
jg .idctfull jg .idctfull
cmp eobd, 1 cmp eobd, 34
jg .idct16x16 jg .idct16x16
cmp eobd, 1
jg .idct8x8
; dc-only case ; dc-only case
movd m0, [blockq] movd m0, [blockq]
@@ -1050,6 +1132,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
RET RET
DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2 DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
.idct8x8:
VP9_IDCT32_1D blockq, 1, 8
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
shl stride30q, 5 ; stride*32
mov cntd, 4
sub stride30q, stride2q ; stride*30
.loop2_8x8:
mov dstq, dst_bakq
lea dst_endq, [dst_bakq+stride30q]
VP9_IDCT32_1D rsp, 2, 8
add dst_bakq, 8
add rsp, 16
dec cntd
jg .loop2_8x8
sub rsp, 64
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
ZERO_BLOCK blockq, 64, 8, m7
RET
.idct16x16: .idct16x16:
mov cntd, 2 mov cntd, 2
.loop1_16x16: .loop1_16x16: