mirror of https://git.ffmpeg.org/ffmpeg.git
vp9/x86: add a sub-8x8 idct fast path to idct_32x32_add_ssse3.
Runtime of the full 32x32 idct goes from 2446 to 2441 cycles (intra) and from 1425 to 1306 cycles (inter); overall runtime is not significantly affected.
This commit is contained in:
parent
37b001d14d
commit
04a187fb2a
|
@ -74,13 +74,22 @@ SECTION .text
|
||||||
psrad m%2, 14
|
psrad m%2, 14
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro VP9_UNPACK_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1, tmp2
|
%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
|
||||||
|
%if %0 == 7
|
||||||
punpckhwd m%6, m%2, m%1
|
punpckhwd m%6, m%2, m%1
|
||||||
VP9_MULSUB_2W_2X %7, %6, %6, %5, [pw_m%3_%4], [pw_%4_%3]
|
VP9_MULSUB_2W_2X %7, %6, %6, %5, [pw_m%3_%4], [pw_%4_%3]
|
||||||
punpcklwd m%2, m%1
|
punpcklwd m%2, m%1
|
||||||
VP9_MULSUB_2W_2X %1, %2, %2, %5, [pw_m%3_%4], [pw_%4_%3]
|
VP9_MULSUB_2W_2X %1, %2, %2, %5, [pw_m%3_%4], [pw_%4_%3]
|
||||||
packssdw m%1, m%7
|
packssdw m%1, m%7
|
||||||
packssdw m%2, m%6
|
packssdw m%2, m%6
|
||||||
|
%else
|
||||||
|
punpckhwd m%8, m%4, m%3
|
||||||
|
VP9_MULSUB_2W_2X %9, %8, %8, %7, [pw_m%5_%6], [pw_%6_%5]
|
||||||
|
punpcklwd m%2, m%4, m%3
|
||||||
|
VP9_MULSUB_2W_2X %1, %2, %2, %7, [pw_m%5_%6], [pw_%6_%5]
|
||||||
|
packssdw m%1, m%9
|
||||||
|
packssdw m%2, m%8
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
|
%macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
|
||||||
|
@ -381,6 +390,32 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
|
||||||
; SUMSUB_BA w, 6, 9, 15 ; t6, t9
|
; SUMSUB_BA w, 6, 9, 15 ; t6, t9
|
||||||
; SUMSUB_BA w, 7, 8, 15 ; t7, t8
|
; SUMSUB_BA w, 7, 8, 15 ; t7, t8
|
||||||
%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
|
%macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
|
||||||
|
%if %2 <= 4
|
||||||
|
mova m3, [%1+ 1*%3] ; IN(1)
|
||||||
|
mova m12, [%1+ 2*%3] ; IN(2)
|
||||||
|
mova m0, [%1+ 3*%3] ; IN(3)
|
||||||
|
|
||||||
|
pmulhrsw m15, m12, [pw_16069x2] ; t6-7
|
||||||
|
pmulhrsw m12, [pw_3196x2] ; t4-5
|
||||||
|
pmulhrsw m4, m3, [pw_16305x2] ; t14-15
|
||||||
|
pmulhrsw m3, [pw_1606x2] ; t8-9
|
||||||
|
pmulhrsw m7, m0, [pw_m4756x2] ; t10-11
|
||||||
|
pmulhrsw m0, [pw_15679x2] ; t12-13
|
||||||
|
|
||||||
|
; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
|
||||||
|
; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
|
||||||
|
|
||||||
|
paddw m14, m15, m12
|
||||||
|
psubw m13, m15, m12
|
||||||
|
pmulhrsw m13, [pw_11585x2] ; t5
|
||||||
|
pmulhrsw m14, [pw_11585x2] ; t6
|
||||||
|
|
||||||
|
VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
|
||||||
|
VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
|
||||||
|
|
||||||
|
; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
|
||||||
|
; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
|
||||||
|
%else
|
||||||
mova m5, [%1+ 1*%3] ; IN(1)
|
mova m5, [%1+ 1*%3] ; IN(1)
|
||||||
mova m14, [%1+ 2*%3] ; IN(2)
|
mova m14, [%1+ 2*%3] ; IN(2)
|
||||||
mova m6, [%1+ 3*%3] ; IN(3)
|
mova m6, [%1+ 3*%3] ; IN(3)
|
||||||
|
@ -442,6 +477,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
|
||||||
pmulhrsw m14, [pw_11585x2] ; t6
|
pmulhrsw m14, [pw_11585x2] ; t6
|
||||||
VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
|
VP9_UNPACK_MULSUB_2W_4X 2, 5, 15137, 6270, [pd_8192], 10, 11 ; t9, t14
|
||||||
VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
|
VP9_UNPACK_MULSUB_2W_4X 6, 1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
|
||||||
|
%endif
|
||||||
|
|
||||||
; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
|
; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
|
||||||
; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
|
; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
|
||||||
|
@ -468,6 +504,17 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
|
||||||
; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
|
; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
|
||||||
|
|
||||||
; from load/start
|
; from load/start
|
||||||
|
%if %2 <= 4
|
||||||
|
mova m11, [%1+ 0*%3] ; IN(0)
|
||||||
|
pmulhrsw m11, [pw_11585x2] ; t0-t3
|
||||||
|
|
||||||
|
psubw m8, m11, m15
|
||||||
|
paddw m15, m11
|
||||||
|
psubw m9, m11, m14
|
||||||
|
paddw m14, m11
|
||||||
|
psubw m10, m11, m13
|
||||||
|
paddw m13, m11
|
||||||
|
%else
|
||||||
mova m10, [%1+ 0*%3] ; IN(0)
|
mova m10, [%1+ 0*%3] ; IN(0)
|
||||||
%if %2 <= 8
|
%if %2 <= 8
|
||||||
pmulhrsw m10, [pw_11585x2] ; t0 and t1
|
pmulhrsw m10, [pw_11585x2] ; t0 and t1
|
||||||
|
@ -490,6 +537,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
|
||||||
SUMSUB_BA w, 15, 8, 7 ; t0, t7
|
SUMSUB_BA w, 15, 8, 7 ; t0, t7
|
||||||
SUMSUB_BA w, 14, 9, 7 ; t1, t6
|
SUMSUB_BA w, 14, 9, 7 ; t1, t6
|
||||||
SUMSUB_BA w, 13, 10, 7 ; t2, t5
|
SUMSUB_BA w, 13, 10, 7 ; t2, t5
|
||||||
|
%endif
|
||||||
SUMSUB_BA w, 12, 11, 7 ; t3, t4
|
SUMSUB_BA w, 12, 11, 7 ; t3, t4
|
||||||
|
|
||||||
SUMSUB_BA w, 0, 15, 7 ; t0, t15
|
SUMSUB_BA w, 0, 15, 7 ; t0, t15
|
||||||
|
@ -716,6 +764,37 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
|
||||||
mova [rsp+26*%%str], m14 ; t14
|
mova [rsp+26*%%str], m14 ; t14
|
||||||
|
|
||||||
; then, secondly, do t16-31
|
; then, secondly, do t16-31
|
||||||
|
%if %3 <= 8
|
||||||
|
mova m4, [%1+ 1*64]
|
||||||
|
mova m3, [%1+ 3*64]
|
||||||
|
mova m0, [%1+ 5*64]
|
||||||
|
mova m7, [%1+ 7*64]
|
||||||
|
|
||||||
|
pmulhrsw m11, m4, [pw_16364x2] ;t31
|
||||||
|
pmulhrsw m4, [pw_804x2] ;t16
|
||||||
|
pmulhrsw m8, m7, [pw_m5520x2] ;t19
|
||||||
|
pmulhrsw m7, [pw_15426x2] ;t28
|
||||||
|
pmulhrsw m15, m0, [pw_15893x2] ;t27
|
||||||
|
pmulhrsw m0, [pw_3981x2] ;t20
|
||||||
|
pmulhrsw m12, m3, [pw_m2404x2] ;t23
|
||||||
|
pmulhrsw m3, [pw_16207x2] ;t24
|
||||||
|
|
||||||
|
; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23,
|
||||||
|
; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31
|
||||||
|
|
||||||
|
VP9_UNPACK_MULSUB_2W_4X 5, 10, 11, 4, 16069, 3196, [pd_8192], 6, 9 ; t17, t30
|
||||||
|
VP9_UNPACK_MULSUB_2W_4X 9, 6, 7, 8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29
|
||||||
|
; from 1 stage forward
|
||||||
|
SUMSUB_BA w, 8, 4, 1
|
||||||
|
; temporary storage
|
||||||
|
mova [rsp+17*%%str], m8 ; t16
|
||||||
|
mova [rsp+21*%%str], m4 ; t19
|
||||||
|
VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
|
||||||
|
VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
|
||||||
|
|
||||||
|
; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
|
||||||
|
; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
|
||||||
|
%else
|
||||||
mova m10, [%1+ 1*64]
|
mova m10, [%1+ 1*64]
|
||||||
mova m13, [%1+ 3*64]
|
mova m13, [%1+ 3*64]
|
||||||
mova m14, [%1+ 5*64]
|
mova m14, [%1+ 5*64]
|
||||||
|
@ -793,6 +872,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
|
||||||
VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
|
VP9_UNPACK_MULSUB_2W_4X 9, 6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
|
||||||
VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
|
VP9_UNPACK_MULSUB_2W_4X 1, 14, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
|
||||||
VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
|
VP9_UNPACK_MULSUB_2W_4X 13, 2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
|
||||||
|
%endif
|
||||||
|
|
||||||
; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
|
; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
|
||||||
; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
|
; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
|
||||||
|
@ -1029,8 +1109,10 @@ INIT_XMM ssse3
|
||||||
cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
|
cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
|
||||||
cmp eobd, 135
|
cmp eobd, 135
|
||||||
jg .idctfull
|
jg .idctfull
|
||||||
cmp eobd, 1
|
cmp eobd, 34
|
||||||
jg .idct16x16
|
jg .idct16x16
|
||||||
|
cmp eobd, 1
|
||||||
|
jg .idct8x8
|
||||||
|
|
||||||
; dc-only case
|
; dc-only case
|
||||||
movd m0, [blockq]
|
movd m0, [blockq]
|
||||||
|
@ -1050,6 +1132,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
|
||||||
RET
|
RET
|
||||||
|
|
||||||
DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
|
DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
|
||||||
|
.idct8x8:
|
||||||
|
VP9_IDCT32_1D blockq, 1, 8
|
||||||
|
|
||||||
|
mov stride30q, strideq ; stride
|
||||||
|
lea stride2q, [strideq*2] ; stride*2
|
||||||
|
shl stride30q, 5 ; stride*32
|
||||||
|
mov cntd, 4
|
||||||
|
sub stride30q, stride2q ; stride*30
|
||||||
|
.loop2_8x8:
|
||||||
|
mov dstq, dst_bakq
|
||||||
|
lea dst_endq, [dst_bakq+stride30q]
|
||||||
|
VP9_IDCT32_1D rsp, 2, 8
|
||||||
|
add dst_bakq, 8
|
||||||
|
add rsp, 16
|
||||||
|
dec cntd
|
||||||
|
jg .loop2_8x8
|
||||||
|
sub rsp, 64
|
||||||
|
|
||||||
|
; at the end of the loop, m7 should still be zero
|
||||||
|
; use that to zero out block coefficients
|
||||||
|
ZERO_BLOCK blockq, 64, 8, m7
|
||||||
|
RET
|
||||||
|
|
||||||
.idct16x16:
|
.idct16x16:
|
||||||
mov cntd, 2
|
mov cntd, 2
|
||||||
.loop1_16x16:
|
.loop1_16x16:
|
||||||
|
|
Loading…
Reference in New Issue