diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm index 46f5ddb9b6..646acdffe3 100644 --- a/libavcodec/x86/h264_weight.asm +++ b/libavcodec/x86/h264_weight.asm @@ -70,8 +70,8 @@ SECTION .text packuswb m0, m1 %endmacro -INIT_MMX -cglobal h264_weight_16_mmxext, 6, 6, 0 +INIT_MMX mmxext +cglobal h264_weight_16, 6, 6, 0 WEIGHT_SETUP .nextrow: WEIGHT_OP 0, 4 @@ -83,8 +83,8 @@ cglobal h264_weight_16_mmxext, 6, 6, 0 jnz .nextrow REP_RET -%macro WEIGHT_FUNC_MM 3 -cglobal h264_weight_%1_%3, 6, 6, %2 +%macro WEIGHT_FUNC_MM 2 +cglobal h264_weight_%1, 6, 6, %2 WEIGHT_SETUP .nextrow: WEIGHT_OP 0, mmsize/2 @@ -95,13 +95,13 @@ cglobal h264_weight_%1_%3, 6, 6, %2 REP_RET %endmacro -INIT_MMX -WEIGHT_FUNC_MM 8, 0, mmxext -INIT_XMM -WEIGHT_FUNC_MM 16, 8, sse2 +INIT_MMX mmxext +WEIGHT_FUNC_MM 8, 0 +INIT_XMM sse2 +WEIGHT_FUNC_MM 16, 8 -%macro WEIGHT_FUNC_HALF_MM 3 -cglobal h264_weight_%1_%3, 6, 6, %2 +%macro WEIGHT_FUNC_HALF_MM 2 +cglobal h264_weight_%1, 6, 6, %2 WEIGHT_SETUP sar r2d, 1 lea r3, [r1*2] @@ -120,10 +120,10 @@ cglobal h264_weight_%1_%3, 6, 6, %2 REP_RET %endmacro -INIT_MMX -WEIGHT_FUNC_HALF_MM 4, 0, mmxext -INIT_XMM -WEIGHT_FUNC_HALF_MM 8, 8, sse2 +INIT_MMX mmxext +WEIGHT_FUNC_HALF_MM 4, 0 +INIT_XMM sse2 +WEIGHT_FUNC_HALF_MM 8, 8 %macro BIWEIGHT_SETUP 0 %if ARCH_X86_64 @@ -135,12 +135,25 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 add off_regd, 1 or off_regd, 1 add r4, 1 +%if cpuflag(ssse3) + movd m4, r5d + movd m0, r6d +%else movd m3, r5d movd m4, r6d +%endif movd m5, off_regd movd m6, r4d pslld m5, m6 psrld m5, 1 +%if cpuflag(ssse3) + punpcklbw m4, m0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m4, m4 + punpcklqdq m5, m5 + +%else %if mmsize == 16 pshuflw m3, m3, 0 pshuflw m4, m4, 0 @@ -154,6 +167,7 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 pshufw m5, m5, 0 %endif pxor m7, m7 +%endif %endmacro %macro BIWEIGHT_STEPA 3 @@ -174,8 +188,8 @@ WEIGHT_FUNC_HALF_MM 8, 8, sse2 packuswb m0, m1 %endmacro -INIT_MMX -cglobal h264_biweight_16_mmxext, 7, 8, 0 +INIT_MMX mmxext +cglobal h264_biweight_16, 7, 8, 0 BIWEIGHT_SETUP movifnidn r3d, r3m .nextrow: @@ -193,8 +207,8 @@ cglobal h264_biweight_16_mmxext, 7, 8, 0 jnz .nextrow REP_RET -%macro BIWEIGHT_FUNC_MM 3 -cglobal h264_biweight_%1_%3, 7, 8, %2 +%macro BIWEIGHT_FUNC_MM 2 +cglobal h264_biweight_%1, 7, 8, %2 BIWEIGHT_SETUP movifnidn r3d, r3m .nextrow: @@ -209,13 +223,13 @@ cglobal h264_biweight_%1_%3, 7, 8, %2 REP_RET %endmacro -INIT_MMX -BIWEIGHT_FUNC_MM 8, 0, mmxext -INIT_XMM -BIWEIGHT_FUNC_MM 16, 8, sse2 +INIT_MMX mmxext +BIWEIGHT_FUNC_MM 8, 0 +INIT_XMM sse2 +BIWEIGHT_FUNC_MM 16, 8 -%macro BIWEIGHT_FUNC_HALF_MM 3 -cglobal h264_biweight_%1_%3, 7, 8, %2 +%macro BIWEIGHT_FUNC_HALF_MM 2 +cglobal h264_biweight_%1, 7, 8, %2 BIWEIGHT_SETUP movifnidn r3d, r3m sar r3, 1 @@ -238,33 +252,10 @@ cglobal h264_biweight_%1_%3, 7, 8, %2 REP_RET %endmacro -INIT_MMX -BIWEIGHT_FUNC_HALF_MM 4, 0, mmxext -INIT_XMM -BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 - -%macro BIWEIGHT_SSSE3_SETUP 0 -%if ARCH_X86_64 -%define off_regd r7d -%else -%define off_regd r3d -%endif - mov off_regd, r7m - add off_regd, 1 - or off_regd, 1 - add r4, 1 - movd m4, r5d - movd m0, r6d - movd m5, off_regd - movd m6, r4d - pslld m5, m6 - psrld m5, 1 - punpcklbw m4, m0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m4, m4 - punpcklqdq m5, m5 -%endmacro +INIT_MMX mmxext +BIWEIGHT_FUNC_HALF_MM 4, 0 +INIT_XMM sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8 %macro BIWEIGHT_SSSE3_OP 0 pmaddubsw m0, m4 @@ -276,9 +267,9 @@ BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 packuswb m0, m2 %endmacro -INIT_XMM -cglobal h264_biweight_16_ssse3, 7, 8, 8 - BIWEIGHT_SSSE3_SETUP +INIT_XMM ssse3 +cglobal h264_biweight_16, 7, 8, 8 + BIWEIGHT_SETUP movifnidn r3d, r3m .nextrow: @@ -295,9 +286,9 @@ cglobal h264_biweight_16_ssse3, 7, 8, 8 jnz .nextrow REP_RET -INIT_XMM -cglobal h264_biweight_8_ssse3, 7, 8, 8 - BIWEIGHT_SSSE3_SETUP +INIT_XMM ssse3 +cglobal h264_biweight_8, 7, 8, 8 + BIWEIGHT_SETUP movifnidn r3d, r3m sar r3, 1 lea r4, [r2*2]