avcodec/x86/hevc: updated to use x86util macros

Reviewed-by: James Almer <jamrial@gmail.com>
Reviewed-by: Ronald S. Bultje
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
plepere 2014-05-16 11:44:20 +02:00, committed by Michael Niedermayer
parent de7b89fd43
commit ef7c4cd001
1 changed file with 67 additions and 88 deletions


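The change itself is mechanical: raw SSE2 spellings are replaced by the x86inc/x86util macros, i.e. mova instead of movdqa, the ABS1/PABSW macros instead of a bare pabsw, and the three-operand form of the punpck* instructions instead of an explicit register copy followed by the destructive two-operand form. A minimal sketch of the pattern (register numbers are illustrative; it assumes an INIT_XMM block with libavutil/x86/x86util.asm %included, as in the file being patched):

    ; before: explicit SSE2 instructions
    movdqa     m2, m0          ; copy needed because punpckhdq overwrites its first operand
    punpckhdq  m2, m4
    movdqa     m1, m0          ; plain full-register copy
    pabsw      m10, m10        ; SSSE3 absolute value written out directly

    ; after: x86inc/x86util macros
    punpckhdq  m2, m0, m4      ; 3-operand form; x86inc inserts the mova itself on SSE2
                               ; and emits a single VEX instruction when AVX is enabled
    mova       m1, m0          ; mova expands to the right move for the current INIT_* mode
    ABS1       m10, m10        ; pabsw where SSSE3 is available, an emulation sequence otherwise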
@@ -57,11 +57,10 @@ INIT_XMM sse2
punpcklbw m5, m7
punpcklwd m4, m5
movdqa m2, m0
punpckhdq m2, m0, m4
punpckldq m0, m4
punpckhdq m2, m4
movdqa m1, m0
movdqa m3, m2
mova m1, m0
mova m3, m2
pxor m5, m5
punpcklbw m0, m5
@@ -81,10 +80,8 @@ INIT_XMM sse2
punpcklbw m0, m1
punpcklbw m2, m3
movdqa m6, m0
punpckhwd m6, m0, m2
punpcklwd m0, m2
punpckhwd m6, m2
movd %1, m0
pshufd m0, m0, 0x39
@@ -113,9 +110,8 @@ INIT_XMM sse2
punpcklwd m0, m2
punpcklwd m1, m3
movdqa m2, m0
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m2, m1
movq m4, %5
movq m6, %6
@@ -124,16 +120,13 @@ INIT_XMM sse2
punpcklwd m4, m6
punpcklwd m5, m7
movdqa m6, m4
punpckhdq m6, m4, m5
punpckldq m4, m5
punpckhdq m6, m5
movdqa m1, m0
punpckhqdq m1, m0, m4
punpcklqdq m0, m4
punpckhqdq m1, m4
movdqa m3, m2
punpckhqdq m3, m2, m6
punpcklqdq m2, m6
punpckhqdq m3, m6
%endmacro
@@ -146,14 +139,13 @@ INIT_XMM sse2
CLIPW m2, m5, [pw_pixel_max]
CLIPW m3, m5, [pw_pixel_max]
movdqa m4, m0
movdqa m5, m2
mova m4, m0
mova m5, m2
punpcklwd m0, m1
punpcklwd m2, m3
movdqa m6, m0
punpckhdq m6, m0, m2
punpckldq m0, m2
punpckhdq m6, m2
movq %1, m0
punpckhqdq m0, m0
@@ -164,9 +156,8 @@ INIT_XMM sse2
punpckhwd m4, m1
punpckhwd m5, m3
movdqa m6, m4
punpckhdq m6, m4, m5
punpckldq m4, m5
punpckhdq m6, m5
movq %5, m4
punpckhqdq m4, m4
@@ -186,8 +177,7 @@ INIT_XMM sse2
punpcklbw m7, m2
punpcklbw m1, m3
movdqa m3, m7
punpcklwd m3, m1
punpcklwd m3, m7, m1
punpckhwd m7, m1
movq m4, %5
@@ -197,34 +187,27 @@ INIT_XMM sse2
punpcklbw m4, m6
punpcklbw m5, m15
movdqa m9, m4
punpcklwd m9, m5
punpcklwd m9, m4, m5
punpckhwd m4, m5
movdqa m1, m3
punpckldq m1, m9; 0, 1
punpckldq m1, m3, m9; 0, 1
punpckhdq m3, m9; 2, 3
movdqa m5, m7
punpckldq m5, m4; 4, 5
punpckldq m5, m7, m4; 4, 5
punpckhdq m7, m4; 6, 7
pxor m13, m13
movdqa m0, m1
punpcklbw m0, m13; 0 in 16 bit
punpcklbw m0, m1, m13; 0 in 16 bit
punpckhbw m1, m13; 1 in 16 bit
movdqa m2, m3;
punpcklbw m2, m13; 2
punpcklbw m2, m3, m13; 2
punpckhbw m3, m13; 3
movdqa m4, m5;
punpcklbw m4, m13; 4
punpcklbw m4, m5, m13; 4
punpckhbw m5, m13; 5
movdqa m6, m7
punpcklbw m6, m13; 6
punpcklbw m6, m7, m13; 6
punpckhbw m7, m13; 7
%endmacro
@@ -244,23 +227,19 @@ INIT_XMM sse2
punpcklbw m0, m1
punpcklbw m2, m3
movdqa m8, m0
punpckhwd m8, m0, m2
punpcklwd m0, m2
punpckhwd m8, m2
punpcklbw m4, m5
punpcklbw m6, m7
movdqa m9, m4
punpckhwd m9, m4, m6
punpcklwd m4, m6
punpckhwd m9, m6
movdqa m10, m0
punpckhdq m10, m0, m4; 2, 3
punpckldq m0, m4; 0, 1
punpckhdq m10, m4; 2, 3
movdqa m11, m8
punpckldq m11, m9; 4, 5
punpckldq m11, m8, m9; 4, 5
punpckhdq m8, m9; 6, 7
movq %1, m0
pshufd m0, m0, 0x4E
@@ -322,10 +301,10 @@ INIT_XMM sse2
; clobbers m10
%macro MASKED_COPY 2
pand %2, m11 ; and mask
movdqa m10, m11
mova m10, m11
pandn m10, %1; and -mask
por %2, m10
movdqa %1, %2
mova %1, %2
%endmacro
; in: %2 clobbered
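MASKED_COPY above (and MASKED_COPY2, defined in the next hunk) is a pre-SSE4 blend: the destination keeps its old value where the mask is clear and takes the new value where it is set. A comment-style restatement, not part of the patch:

    ; MASKED_COPY  dst, src:        dst = (src & m11) | (dst & ~m11)
    ; MASKED_COPY2 dst, src, mask:  same, with the mask passed in %3 (clobbered)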
@@ -335,14 +314,14 @@ INIT_XMM sse2
pand %2, %3 ; and mask
pandn %3, %1; and -mask
por %2, %3
movdqa %1, %2
mova %1, %2
%endmacro
ALIGN 16
; input in m0 ... m3 and tcs in r2. Output in m1 and m2
%macro CHROMA_DEBLOCK_BODY 1
movdqa m4, m2; temp copy of q0
movdqa m5, m0; temp copy of p1
mova m4, m2; temp copy of q0
mova m5, m0; temp copy of p1
psubw m4, m1; q0 - p0
psubw m5, m3; p1 - q1
psllw m4, 2; << 2
@@ -355,7 +334,7 @@ ALIGN 16
movd m7, [r2]; tc1
punpcklwd m7, m7
shufps m6, m7, 0; tc0, tc1
movdqa m4, m6
mova m4, m6
pcmpeqw m7, m7; set all bits to 1
pxor m4, m7; flip all bits of first reg
psrlw m7, 15; 1 in every cell
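For reference, CHROMA_DEBLOCK_BODY is computing the standard HEVC chroma deblocking delta, and the pcmpeqw/pxor/psrlw sequence above is the usual two's-complement negation of tc (flip the bits, add one) so that both clamp bounds are at hand. A comment-style restatement; the clamp and the p0/q0 updates are finished by the code following this hunk:

    ; delta = clip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
    ; p0'   = p0 + delta         ; returned in m1
    ; q0'   = q0 - delta         ; returned in m2
    ; -tc   = (tc ^ 0xFFFF) + 1  ; pcmpeqw/pxor flip the bits, the psrlw result supplies the +1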
@@ -376,19 +355,19 @@ ALIGN 16
; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6
%macro LUMA_DEBLOCK_BODY 2
movdqa m9, m2
mova m9, m2
psllw m9, 1; *2
movdqa m10, m1
mova m10, m1
psubw m10, m9
paddw m10, m3
pabsw m10, m10 ; 0dp0, 0dp3 , 1dp0, 1dp3
ABS1 m10, m10 ; 0dp0, 0dp3 , 1dp0, 1dp3
movdqa m9, m5
mova m9, m5
psllw m9, 1; *2
movdqa m11, m6
mova m11, m6
psubw m11, m9
paddw m11, m4
pabsw m11, m11 ; 0dq0, 0dq3 , 1dq0, 1dq3
ABS1 m11, m11 ; 0dq0, 0dq3 , 1dq0, 1dq3
;beta calculations
mov r11, [betaq];
@@ -403,7 +382,7 @@ ALIGN 16
pshufd m13, m14, 0; beta0, beta1
;end beta calculations
movdqa m9, m10
mova m9, m10
paddw m9, m11; 0d0, 0d3 , 1d0, 1d3
pshufhw m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
@@ -413,7 +392,7 @@ ALIGN 16
pshuflw m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
paddw m14, m9; 0d0+0d3, 1d0+1d3
movdqa m15, m13; beta0, beta1
mova m15, m13; beta0, beta1
;compare
pcmpgtw m15, m14
@@ -422,9 +401,9 @@ ALIGN 16
je bypasswrite_macro_%2%1
;weak / strong decision compare to beta_2
movdqa m15, m13; beta0, beta1
mova m15, m13; beta0, beta1
psraw m15, 2; beta >> 2
movdqa m8, m9;
mova m8, m9;
psllw m8, 1;
pcmpgtw m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
movmskps r14, m15;
@@ -481,19 +460,19 @@ ALIGN 16
je bypasswrite_macro_%2%1
punpcklwd m9, m9
shufps m8, m9, 0; tc0, tc1
movdqa m9, m8
mova m9, m8
psllw m8, 2; tc << 2
pavgw m8, m9; tc25 = ((tc * 5 + 1) >> 1)
;end tc25 calculations
;----beta_3 comparison-----
movdqa m12, m0; p3
mova m12, m0; p3
psubw m12, m3; p3 - p0
pabsw m12, m12; abs(p3 - p0)
ABS1 m12, m12; abs(p3 - p0)
movdqa m15, m7; q3
mova m15, m7; q3
psubw m15, m4; q3 - q0
pabsw m15, m15; abs(q3 - q0)
ABS1 m15, m15; abs(q3 - q0)
paddw m12, m15; abs(p3 - p0) + abs(q3 - q0)
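The tc25 threshold above is produced without a multiply: pavgw rounds up, computing (a + b + 1) >> 1, so a shift plus an average is enough. Restated:

    ; m9 = tc
    ; m8 = tc << 2     ; 4*tc
    ; pavgw m8, m9     ; (4*tc + tc + 1) >> 1 = (tc*5 + 1) >> 1 = tc25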
@@ -506,9 +485,9 @@ ALIGN 16
and r14, r2; strong mask , beta_2 and beta_3 comparisons
;----beta_3 comparison end-----
;----tc25 comparison---
movdqa m12, m3; p0
mova m12, m3; p0
psubw m12, m4; p0 - q0
pabsw m12, m12; abs(p0 - q0)
ABS1 m12, m12; abs(p0 - q0)
pshufhw m12, m12, 0xf0 ;0b11110000;
pshuflw m12, m12, 0xf0 ;0b11110000;
@@ -522,7 +501,7 @@ ALIGN 16
and r14, r2; strong mask, bits 2 and 0
pcmpeqw m13, m13; set all bits to 1
movdqa m14, m9; tc
mova m14, m9; tc
pxor m14, m13; invert bits
psrlw m13, 15; 1 in every cell
paddw m14, m13; -tc
@@ -549,10 +528,10 @@ ALIGN 16
psllw m13, 2; 4 in every cell
pand m11, m10; combine filtering mask and strong mask
movdqa m12, m2; p1
mova m12, m2; p1
paddw m12, m3; p1 + p0
paddw m12, m4; p1 + p0 + q0
movdqa m10, m12; copy
mova m10, m12; copy
psllw m12, 1; 2*p1 + 2*p0 + 2*q0
paddw m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
paddw m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
@@ -563,7 +542,7 @@ ALIGN 16
pminsw m12, m9; av_clip( , -2 * tc, 2 * tc)
paddw m12, m3; p0'
movdqa m15, m1; p2
mova m15, m1; p2
paddw m15, m10; p2 + p1 + p0 + q0
psrlw m13, 1; 2 in every cell
paddw m15, m13; p2 + p1 + p0 + q0 + 2
@@ -573,7 +552,7 @@ ALIGN 16
pminsw m15, m9; av_clip( , -2 * tc, 2 * tc)
paddw m15, m2; p1'
movdqa m8, m1; p2
mova m8, m1; p2
paddw m8, m0; p3 + p2
psllw m8, 1; 2*p3 + 2*p2
paddw m8, m1; 2*p3 + 3*p2
@@ -587,7 +566,7 @@ ALIGN 16
paddw m8, m1; p2'
MASKED_COPY m1, m8
movdqa m8, m3; p0
mova m8, m3; p0
paddw m8, m4; p0 + q0
paddw m8, m5; p0 + q0 + q1
psllw m8, 1; 2*p0 + 2*q0 + 2*q1
@@ -601,10 +580,10 @@ ALIGN 16
paddw m8, m4; q0'
MASKED_COPY m2, m15
movdqa m15, m3; p0
mova m15, m3; p0
paddw m15, m4; p0 + q0
paddw m15, m5; p0 + q0 + q1
movdqa m10, m15;
mova m10, m15;
paddw m15, m6; p0 + q0 + q1 + q2
psrlw m13, 1; 2 in every cell
paddw m15, m13; p0 + q0 + q1 + q2 + 2
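These hunks accumulate the HEVC strong-filter sums that the inline comments spell out term by term; in scalar form (each result is additionally clamped to within ±2*tc of the original sample by the surrounding pmaxsw/pminsw pairs):

    ; p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    ; p1' = (p2 + p1 + p0 + q0 + 2) >> 2
    ; p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    ; q0' = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    ; q1' = (p0 + q0 + q1 + q2 + 2) >> 2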
@@ -662,24 +641,24 @@ weakfilter_macro_%2%1:
psrlw m13, 15; 1 in every cell
psllw m13, 3; 8 in every cell
movdqa m12, m4 ; q0
mova m12, m4 ; q0
psubw m12, m3 ; q0 - p0
movdqa m10, m12
mova m10, m12
psllw m10, 3; 8 * (q0 - p0)
paddw m12, m10 ; 9 * (q0 - p0)
movdqa m10, m5 ; q1
mova m10, m5 ; q1
psubw m10, m2 ; q1 - p1
movdqa m8, m10
mova m8, m10
psllw m8, 1; 2 * ( q1 - p1 )
paddw m10, m8; 3 * ( q1 - p1 )
psubw m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
paddw m12, m13; + 8
psraw m12, 4; >> 4 , delta0
pabsw m13, m12; abs(delta0)
PABSW m13, m12; abs(delta0)
movdqa m10, m9; 2*tc
mova m10, m9; 2*tc
psllw m10, 2; 8 * tc
paddw m10, m9; 10 * tc
pcmpgtw m10, m13
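The weak-filter delta above is likewise built from shifts and adds only; a scalar restatement of this hunk, including the tc-based gate that the final pcmpgtw implements:

    ; delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4
    ; filtering is applied only where abs(delta0) < 10*tc
    ;   9*(q0 - p0) is formed as (q0 - p0) + 8*(q0 - p0)
    ;   3*(q1 - p1) is formed as (q1 - p1) + 2*(q1 - p1)
    ;   10*tc       is formed as 8*tc + 2*tc (m9 already holds 2*tc here)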
@@ -693,12 +672,12 @@ weakfilter_macro_%2%1:
pcmpeqw m13, m13; set all bits to 1
psraw m9, 1; tc -> tc / 2
movdqa m14, m9;
mova m14, m9;
pxor m14, m13; complement -tc
psrlw m13, 15; set all cells to 1
paddw m14, m13; add 1, -tc / 2
movdqa m15, m1; p2
mova m15, m1; p2
pavgw m15, m3; (p2 + p0 + 1) >> 1
psubw m15, m2; ((p2 + p0 + 1) >> 1) - p1
paddw m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
@@ -719,13 +698,13 @@ weakfilter_macro_%2%1:
punpcklwd m8, m8
punpcklwd m13, m13
shufps m13, m8, 0;
movdqa m8, m10; copy of beta
mova m8, m10; copy of beta
pcmpgtw m8, m13
pand m8, m11
;end beta calculations
MASKED_COPY2 m2, m15, m8; write p1'
movdqa m8, m6; q2
mova m8, m6; q2
pavgw m8, m4; (q2 + q0 + 1) >> 1
psubw m8, m5; ((q2 + q0 + 1) >> 1) - q1
psubw m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
@@ -744,11 +723,11 @@ weakfilter_macro_%2%1:
pand m10, m11
MASKED_COPY2 m5, m8, m10; write q1'
movdqa m15, m3 ; p0
mova m15, m3 ; p0
paddw m15, m12 ; p0 + delta0
MASKED_COPY m3, m15
movdqa m8, m4 ; q0
mova m8, m4 ; q0
psubw m8, m12 ; q0 - delta0
MASKED_COPY m4, m8
ready_macro_%2%1: