diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index d27b131586..b79ed40d3b 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -57,11 +57,10 @@ INIT_XMM sse2
     punpcklbw         m5, m7
     punpcklwd         m4, m5
-    movdqa            m2, m0
+    punpckhdq         m2, m0, m4
     punpckldq         m0, m4
-    punpckhdq         m2, m4
-    movdqa            m1, m0
-    movdqa            m3, m2
+    mova              m1, m0
+    mova              m3, m2

     pxor              m5, m5
     punpcklbw         m0, m5
@@ -81,10 +80,8 @@ INIT_XMM sse2
     punpcklbw         m0, m1
     punpcklbw         m2, m3
-    movdqa            m6, m0
-
+    punpckhwd         m6, m0, m2
     punpcklwd         m0, m2
-    punpckhwd         m6, m2

     movd              %1, m0
     pshufd            m0, m0, 0x39
@@ -113,9 +110,8 @@ INIT_XMM sse2
     punpcklwd         m0, m2
     punpcklwd         m1, m3
-    movdqa            m2, m0
+    punpckhdq         m2, m0, m1
     punpckldq         m0, m1
-    punpckhdq         m2, m1

     movq              m4, %5
     movq              m6, %6
@@ -124,16 +120,13 @@ INIT_XMM sse2
     punpcklwd         m4, m6
     punpcklwd         m5, m7
-    movdqa            m6, m4
+    punpckhdq         m6, m4, m5
     punpckldq         m4, m5
-    punpckhdq         m6, m5

-    movdqa            m1, m0
+    punpckhqdq        m1, m0, m4
     punpcklqdq        m0, m4
-    punpckhqdq        m1, m4

-    movdqa            m3, m2
+    punpckhqdq        m3, m2, m6
     punpcklqdq        m2, m6
-    punpckhqdq        m3, m6
 %endmacro
@@ -146,14 +139,13 @@ INIT_XMM sse2
     CLIPW             m2, m5, [pw_pixel_max]
     CLIPW             m3, m5, [pw_pixel_max]
-    movdqa            m4, m0
-    movdqa            m5, m2
+    mova              m4, m0
+    mova              m5, m2
     punpcklwd         m0, m1
     punpcklwd         m2, m3
-    movdqa            m6, m0
+    punpckhdq         m6, m0, m2
     punpckldq         m0, m2
-    punpckhdq         m6, m2

     movq              %1, m0
     punpckhqdq        m0, m0
@@ -164,9 +156,8 @@ INIT_XMM sse2
     punpckhwd         m4, m1
     punpckhwd         m5, m3
-    movdqa            m6, m4
+    punpckhdq         m6, m4, m5
     punpckldq         m4, m5
-    punpckhdq         m6, m5

     movq              %5, m4
     punpckhqdq        m4, m4
@@ -186,8 +177,7 @@ INIT_XMM sse2
     punpcklbw         m7, m2
     punpcklbw         m1, m3
-    movdqa            m3, m7
-    punpcklwd         m3, m1
+    punpcklwd         m3, m7, m1
     punpckhwd         m7, m1

     movq              m4, %5
@@ -197,34 +187,27 @@ INIT_XMM sse2
     punpcklbw         m4, m6
     punpcklbw         m5, m15
-    movdqa            m9, m4
-    punpcklwd         m9, m5
+    punpcklwd         m9, m4, m5
     punpckhwd         m4, m5

-    movdqa            m1, m3
-    punpckldq         m1, m9; 0, 1
+    punpckldq         m1, m3, m9; 0, 1
     punpckhdq         m3, m9; 2, 3

-    movdqa            m5, m7
-    punpckldq         m5, m4; 4, 5
+    punpckldq         m5, m7, m4; 4, 5
     punpckhdq         m7, m4; 6, 7

     pxor              m13, m13

-    movdqa            m0, m1
-    punpcklbw         m0, m13; 0 in 16 bit
+    punpcklbw         m0, m1, m13; 0 in 16 bit
     punpckhbw         m1, m13; 1 in 16 bit

-    movdqa            m2, m3;
-    punpcklbw         m2, m13; 2
+    punpcklbw         m2, m3, m13; 2
     punpckhbw         m3, m13; 3

-    movdqa            m4, m5;
-    punpcklbw         m4, m13; 4
+    punpcklbw         m4, m5, m13; 4
     punpckhbw         m5, m13; 5

-    movdqa            m6, m7
-    punpcklbw         m6, m13; 6
+    punpcklbw         m6, m7, m13; 6
     punpckhbw         m7, m13; 7
 %endmacro
@@ -244,23 +227,19 @@ INIT_XMM sse2
     punpcklbw         m0, m1
     punpcklbw         m2, m3
-    movdqa            m8, m0
+    punpckhwd         m8, m0, m2
     punpcklwd         m0, m2
-    punpckhwd         m8, m2

     punpcklbw         m4, m5
     punpcklbw         m6, m7
-    movdqa            m9, m4
+    punpckhwd         m9, m4, m6
     punpcklwd         m4, m6
-    punpckhwd         m9, m6

-    movdqa            m10, m0
+    punpckhdq         m10, m0, m4; 2, 3
     punpckldq         m0, m4; 0, 1
-    punpckhdq         m10, m4; 2, 3

-    movdqa            m11, m8
-    punpckldq         m11, m9; 4, 5
+    punpckldq         m11, m8, m9; 4, 5
     punpckhdq         m8, m9; 6, 7

     movq              %1, m0
     pshufd            m0, m0, 0x4E
@@ -322,10 +301,10 @@ INIT_XMM sse2
 ; clobbers m10
 %macro MASKED_COPY 2
     pand              %2, m11 ; and mask
-    movdqa            m10, m11
+    mova              m10, m11
     pandn             m10, %1; and -mask
     por               %2, m10
-    movdqa            %1, %2
+    mova              %1, %2
 %endmacro

 ; in: %2 clobbered
@@ -335,14 +314,14 @@ INIT_XMM sse2
     pand              %2, %3 ; and mask
     pandn             %3, %1; and -mask
     por               %2, %3
-    movdqa            %1, %2
+    mova              %1, %2
 %endmacro

 ALIGN 16
 ; input in m0 ... m3 and tcs in r2. Output in m1 and m2
 %macro CHROMA_DEBLOCK_BODY 1
-    movdqa            m4, m2; temp copy of q0
-    movdqa            m5, m0; temp copy of p1
+    mova              m4, m2; temp copy of q0
+    mova              m5, m0; temp copy of p1
     psubw             m4, m1; q0 - p0
     psubw             m5, m3; p1 - q1
     psllw             m4, 2; << 2
@@ -355,7 +334,7 @@ ALIGN 16
     movd              m7, [r2]; tc1
     punpcklwd         m7, m7
     shufps            m6, m7, 0; tc0, tc1
-    movdqa            m4, m6
+    mova              m4, m6
     pcmpeqw           m7, m7; set all bits to 1
     pxor              m4, m7; flip all bits of first reg
     psrlw             m7, 15; 1 in every cell
@@ -376,19 +355,19 @@ ALIGN 16
 ; input in m0 ... m7, betas in r2 tcs in r3. Output in m1...m6
 %macro LUMA_DEBLOCK_BODY 2
-    movdqa            m9, m2
+    mova              m9, m2
     psllw             m9, 1; *2
-    movdqa            m10, m1
+    mova              m10, m1
     psubw             m10, m9
     paddw             m10, m3
-    pabsw             m10, m10 ; 0dp0, 0dp3 , 1dp0, 1dp3
+    ABS1              m10, m10 ; 0dp0, 0dp3 , 1dp0, 1dp3

-    movdqa            m9, m5
+    mova              m9, m5
     psllw             m9, 1; *2
-    movdqa            m11, m6
+    mova              m11, m6
     psubw             m11, m9
     paddw             m11, m4
-    pabsw             m11, m11 ; 0dq0, 0dq3 , 1dq0, 1dq3
+    ABS1              m11, m11 ; 0dq0, 0dq3 , 1dq0, 1dq3

 ;beta calculations
     mov               r11, [betaq];
@@ -403,7 +382,7 @@ ALIGN 16
     pshufd            m13, m14, 0; beta0, beta1
 ;end beta calculations

-    movdqa            m9, m10
+    mova              m9, m10
     paddw             m9, m11; 0d0, 0d3 , 1d0, 1d3

     pshufhw           m14, m9, 0x0f ;0b00001111; 0d3 0d3 0d0 0d0 in high
@@ -413,7 +392,7 @@ ALIGN 16
     pshuflw           m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3
     paddw             m14, m9; 0d0+0d3, 1d0+1d3

-    movdqa            m15, m13; beta0, beta1
+    mova              m15, m13; beta0, beta1

 ;compare
     pcmpgtw           m15, m14
@@ -422,9 +401,9 @@ ALIGN 16
     je bypasswrite_macro_%2%1

 ;weak / strong decision compare to beta_2
-    movdqa            m15, m13; beta0, beta1
+    mova              m15, m13; beta0, beta1
     psraw             m15, 2; beta >> 2
-    movdqa            m8, m9;
+    mova              m8, m9;
     psllw             m8, 1;
     pcmpgtw           m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
     movmskps          r14, m15;
@@ -481,19 +460,19 @@ ALIGN 16
     je bypasswrite_macro_%2%1
     punpcklwd         m9, m9
     shufps            m8, m9, 0; tc0, tc1
-    movdqa            m9, m8
+    mova              m9, m8
     psllw             m8, 2; tc << 2
     pavgw             m8, m9; tc25 = ((tc * 5 + 1) >> 1)
 ;end tc25 calculations
 ;----beta_3 comparison-----
-    movdqa            m12, m0; p3
+    mova              m12, m0; p3
     psubw             m12, m3; p3 - p0
-    pabsw             m12, m12; abs(p3 - p0)
+    ABS1              m12, m12; abs(p3 - p0)

-    movdqa            m15, m7; q3
+    mova              m15, m7; q3
     psubw             m15, m4; q3 - q0
-    pabsw             m15, m15; abs(q3 - q0)
+    ABS1              m15, m15; abs(q3 - q0)

     paddw             m12, m15; abs(p3 - p0) + abs(q3 - q0)
@@ -506,9 +485,9 @@ ALIGN 16
     and               r14, r2; strong mask , beta_2 and beta_3 comparisons
 ;----beta_3 comparison end-----
 ;----tc25 comparison---
-    movdqa            m12, m3; p0
+    mova              m12, m3; p0
     psubw             m12, m4; p0 - q0
-    pabsw             m12, m12; abs(p0 - q0)
+    ABS1              m12, m12; abs(p0 - q0)

     pshufhw           m12, m12, 0xf0 ;0b11110000;
     pshuflw           m12, m12, 0xf0 ;0b11110000;
@@ -522,7 +501,7 @@ ALIGN 16
     and               r14, r2; strong mask, bits 2 and 0

     pcmpeqw           m13, m13; set all bits to 1
-    movdqa            m14, m9; tc
+    mova              m14, m9; tc
     pxor              m14, m13; invert bits
     psrlw             m13, 15; 1 in every cell
     paddw             m14, m13; -tc
@@ -549,10 +528,10 @@ ALIGN 16
     psllw             m13, 2; 4 in every cell

     pand              m11, m10; combine filtering mask and strong mask
-    movdqa            m12, m2; p1
+    mova              m12, m2; p1
     paddw             m12, m3; p1 + p0
     paddw             m12, m4; p1 + p0 + q0
-    movdqa            m10, m12; copy
+    mova              m10, m12; copy
     psllw             m12, 1; 2*p1 + 2*p0 + 2*q0
     paddw             m12, m1; p2 + 2*p1 + 2*p0 + 2*q0
     paddw             m12, m5; p2 + 2*p1 + 2*p0 + 2*q0 + q1
@@ -563,7 +542,7 @@ ALIGN 16
     pminsw            m12, m9; av_clip( , -2 * tc, 2 * tc)
     paddw             m12, m3; p0'

-    movdqa            m15, m1; p2
+    mova              m15, m1; p2
     paddw             m15, m10; p2 + p1 + p0 + q0
     psrlw             m13, 1; 2 in every cell
     paddw             m15, m13; p2 + p1 + p0 + q0 + 2
@@ -573,7 +552,7 @@ ALIGN 16
     pminsw            m15, m9; av_clip( , -2 * tc, 2 * tc)
     paddw             m15, m2; p1'

-    movdqa            m8, m1; p2
+    mova              m8, m1; p2
     paddw             m8, m0; p3 + p2
     psllw             m8, 1; 2*p3 + 2*p2
     paddw             m8, m1; 2*p3 + 3*p2
@@ -587,7 +566,7 @@ ALIGN 16
     paddw             m8, m1; p2'
     MASKED_COPY       m1, m8

-    movdqa            m8, m3; p0
+    mova              m8, m3; p0
     paddw             m8, m4; p0 + q0
     paddw             m8, m5; p0 + q0 + q1
     psllw             m8, 1; 2*p0 + 2*q0 + 2*q1
@@ -601,10 +580,10 @@ ALIGN 16
     paddw             m8, m4; q0'
     MASKED_COPY       m2, m15

-    movdqa            m15, m3; p0
+    mova              m15, m3; p0
     paddw             m15, m4; p0 + q0
     paddw             m15, m5; p0 + q0 + q1
-    movdqa            m10, m15;
+    mova              m10, m15;
     paddw             m15, m6; p0 + q0 + q1 + q2
     psrlw             m13, 1; 2 in every cell
     paddw             m15, m13; p0 + q0 + q1 + q2 + 2
@@ -662,24 +641,24 @@ weakfilter_macro_%2%1:
     psrlw             m13, 15; 1 in every cell
     psllw             m13, 3; 8 in every cell

-    movdqa            m12, m4 ; q0
+    mova              m12, m4 ; q0
     psubw             m12, m3 ; q0 - p0
-    movdqa            m10, m12
+    mova              m10, m12
     psllw             m10, 3; 8 * (q0 - p0)
     paddw             m12, m10 ; 9 * (q0 - p0)

-    movdqa            m10, m5 ; q1
+    mova              m10, m5 ; q1
     psubw             m10, m2 ; q1 - p1
-    movdqa            m8, m10
+    mova              m8, m10
     psllw             m8, 1; 2 * ( q1 - p1 )
     paddw             m10, m8; 3 * ( q1 - p1 )
     psubw             m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
     paddw             m12, m13; + 8
     psraw             m12, 4; >> 4 , delta0
-    pabsw             m13, m12; abs(delta0)
+    PABSW             m13, m12; abs(delta0)

-    movdqa            m10, m9; 2*tc
+    mova              m10, m9; 2*tc
     psllw             m10, 2; 8 * tc
     paddw             m10, m9; 10 * tc
     pcmpgtw           m10, m13
@@ -693,12 +672,12 @@ weakfilter_macro_%2%1:
     pcmpeqw           m13, m13; set all bits to 1
     psraw             m9, 1; tc -> tc / 2
-    movdqa            m14, m9;
+    mova              m14, m9;
     pxor              m14, m13; complement -tc
     psrlw             m13, 15; set all cells to 1
     paddw             m14, m13; add 1, -tc / 2

-    movdqa            m15, m1; p2
+    mova              m15, m1; p2
     pavgw             m15, m3; (p2 + p0 + 1) >> 1
     psubw             m15, m2; ((p2 + p0 + 1) >> 1) - p1
     paddw             m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
@@ -719,13 +698,13 @@ weakfilter_macro_%2%1:
     punpcklwd         m8, m8
     punpcklwd         m13, m13
     shufps            m13, m8, 0;
-    movdqa            m8, m10; copy of beta
+    mova              m8, m10; copy of beta
     pcmpgtw           m8, m13
     pand              m8, m11
 ;end beta calculations
     MASKED_COPY2      m2, m15, m8; write p1'

-    movdqa            m8, m6; q2
+    mova              m8, m6; q2
     pavgw             m8, m4; (q2 + q0 + 1) >> 1
     psubw             m8, m5; ((q2 + q0 + 1) >> 1) - q1
     psubw             m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
@@ -744,11 +723,11 @@ weakfilter_macro_%2%1:
     pand              m10, m11
     MASKED_COPY2      m5, m8, m10; write q1'

-    movdqa            m15, m3 ; p0
+    mova              m15, m3 ; p0
     paddw             m15, m12 ; p0 + delta0
     MASKED_COPY       m3, m15

-    movdqa            m8, m4 ; q0
+    mova              m8, m4 ; q0
     psubw             m8, m12 ; q0 - delta0
     MASKED_COPY       m4, m8
 ready_macro_%2%1:
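
Note (illustration, not part of the patch): the pattern throughout this change is to drop explicit movdqa register copies by using x86inc's pseudo three-operand syntax, so a single source line assembles to mova plus the destructive SSE2 instruction today and to one VEX-encoded instruction once an AVX build is enabled; the ABS1/PABSW lines rely on the x86util absolute-value helpers, which use pabsw where SSSE3 is available and fall back to an equivalent short sequence otherwise. Below is a minimal sketch of the idea in isolation; the macro and function names are made up for this note, and it assumes it is assembled inside the FFmpeg tree so that x86util.asm is on the include path.

; Illustrative sketch only; not taken from hevc_deblock.asm.
%include "libavutil/x86/x86util.asm"

SECTION .text

%macro TRANSPOSE_STEP_EXAMPLE 0
    ; Under INIT_XMM sse2, x86inc emulates the first line as
    ;     movdqa    m2, m0
    ;     punpckhdq m2, m4
    ; while under INIT_XMM avx it becomes a single vpunpckhdq xmm2, xmm0, xmm4.
    punpckhdq         m2, m0, m4
    punpckldq         m0, m4
%endmacro

INIT_XMM sse2
cglobal transpose_step_example, 0, 0, 5
    TRANSPOSE_STEP_EXAMPLE
    RET

INIT_XMM avx
cglobal transpose_step_example, 0, 0, 5
    TRANSPOSE_STEP_EXAMPLE
    RET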