diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index ced23ceffa..15baaff646 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -187,6 +187,12 @@ void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
 lpf_funcs(16, 16, sse2);
 lpf_funcs(16, 16, ssse3);
 lpf_funcs(16, 16, avx);
+lpf_funcs(84, 16, sse2);
+lpf_funcs(84, 16, ssse3);
+lpf_funcs(84, 16, avx);
+lpf_funcs(48, 16, sse2);
+lpf_funcs(48, 16, ssse3);
+lpf_funcs(48, 16, avx);
 lpf_funcs(88, 16, sse2);
 lpf_funcs(88, 16, ssse3);
 lpf_funcs(88, 16, avx);
@@ -224,6 +230,19 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
     init_subpel2(idx, 0, 1, v, type, opt); \
     init_subpel2(idx, 1, 0, h, type, opt)
 
+#define init_lpf(opt) do { \
+    if (ARCH_X86_64) { \
+        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \
+        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
+        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
+        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
+        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
+        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
+        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
+        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
+    } \
+} while (0)
+
     if (EXTERNAL_MMX(cpu_flags)) {
         init_fpel(4, 0, 4, put, mmx);
         init_fpel(3, 0, 8, put, mmx);
@@ -248,12 +267,7 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_fpel(2, 1, 16, avg, sse2);
         init_fpel(1, 1, 32, avg, sse2);
         init_fpel(0, 1, 64, avg, sse2);
-        if (ARCH_X86_64) {
-            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
-            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
-            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
-            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
-        }
+        init_lpf(sse2);
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
@@ -276,11 +290,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
             dsp->itxfm_add[TX_32X32][ADST_DCT] =
             dsp->itxfm_add[TX_32X32][DCT_ADST] =
             dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
-            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_ssse3;
-            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_ssse3;
-            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
-            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
         }
+        init_lpf(ssse3);
     }
 
     if (EXTERNAL_AVX(cpu_flags)) {
@@ -297,11 +308,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
             dsp->itxfm_add[TX_32X32][ADST_DCT] =
             dsp->itxfm_add[TX_32X32][DCT_ADST] =
             dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx;
-            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_avx;
-            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_avx;
-            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_avx;
-            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_avx;
         }
+        init_lpf(avx);
     }
 
 #undef init_fpel
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index d187b28ce6..eeab2cb3db 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -44,6 +44,11 @@ pw_8: times 8 dw 8
 mask_mix: times 8 db 0
           times 8 db 1
 
+mask_mix84: times 8 db 0xff
+            times 8 db 0x00
+mask_mix48: times 8 db 0x00
+            times 8 db 0xff
+
 SECTION .text
 
 ; %1 = abs(%2-%3)
@@ -324,7 +329,7 @@ SECTION .text
     neg mstride3q
 
 %ifidn %1, h
-%if %2 == 88
+%if %2 > 16
 %define movx movh
     lea dstq, [dstq + 8*strideq - 4]
 %else
@@ -372,7 +377,7 @@ SECTION .text
 %define Q6 rsp + 224
 %define Q7 rsp + 240
 
-%if %2 != 88
+%if %2 == 16
     TRANSPOSE16x16B 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, [rsp]
     mova [P7], m0
     mova [P6], m1
@@ -389,7 +394,7 @@ SECTION .text
     mova [Q1], m9
     mova [Q2], m10
     mova [Q3], m11
-%if %2 != 88
+%if %2 == 16
     mova [Q4], m12
     mova [Q5], m13
     mova [Q6], m14
@@ -404,7 +409,7 @@ SECTION .text
 %endif
     SPLATB_REG m2, I, m0 ; I I I I ...
     SPLATB_REG m3, E, m0 ; E E E E ...
-%elif %2 == 88
+%else
 %if cpuflag(ssse3)
     mova m0, [mask_mix]
 %endif
@@ -462,7 +467,7 @@ SECTION .text
     ABSSUB_CMP m1, m9, m11, m6, m4, m5, m8 ; abs(p2 - p0) <= 1
     pand m2, m1
     ABSSUB m4, m10, m11, m5 ; abs(p1 - p0)
-%if %2 != 88
+%if %2 == 16
 %if cpuflag(ssse3)
     pxor m0, m0
 %endif
@@ -490,8 +495,11 @@ SECTION .text
     pand m2, m1
     ABSSUB_CMP m1, m15, m12, m6, m4, m5, m8 ; abs(q3 - q0) <= 1
     pand m2, m1 ; flat8in final value
+%if %2 == 84 || %2 == 48
+    pand m2, [mask_mix%2]
+%endif
 
-%if %2 != 88
+%if %2 == 16
     ; (m0: hev, m2: flat8in, m3: fm, m6: pb_81, m9..15: p2 p1 p0 q0 q1 q2 q3)
     ; calc flat8out mask
     mova m8, [P7]
@@ -584,7 +592,7 @@ SECTION .text
     ; ([m1: flat8out], m2: flat8in, m3: fm, m10..13: p1 p0 q0 q1)
     ; filter6()
     pxor m0, m0
-%if %2 == 88
+%if %2 > 16
     pand m3, m2
 %else
     pand m2, m3 ; mask(fm) & mask(in)
@@ -622,7 +630,7 @@ SECTION .text
     ; q5 +5 -p2 -q4 +q5 +q7 . q5 . .
     ; q6 +6 -p1 -q5 +q6 +q7 . q6 . .
 
-%if %2 != 88
+%if %2 == 16
     pand m1, m2 ; mask(out) & (mask(fm) & mask(in))
     mova m2, [P7]
     mova m3, [P6]
@@ -645,7 +653,7 @@ SECTION .text
 %endif
 
 %ifidn %1, h
-%if %2 != 88
+%if %2 == 16
     mova m0, [P7]
     mova m1, [P6]
     mova m2, [P5]
@@ -753,28 +761,23 @@ SECTION .text
     RET
 %endmacro
 
-%macro LPF_16_16_VH 1
-INIT_XMM %1
-cglobal vp9_loop_filter_v_16_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
-    LOOPFILTER v, 16
-cglobal vp9_loop_filter_h_16_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
-    LOOPFILTER h, 16
+%macro LPF_16_VH 2
+INIT_XMM %2
+cglobal vp9_loop_filter_v_%1_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
+    LOOPFILTER v, %1
+cglobal vp9_loop_filter_h_%1_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
+    LOOPFILTER h, %1
 %endmacro
 
-%macro LPF_88_16_VH 1
-INIT_XMM %1
-cglobal vp9_loop_filter_v_88_16, 5,10,16, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
-    LOOPFILTER v, 88
-cglobal vp9_loop_filter_h_88_16, 5,10,16, 256, dst, stride, E, I, H, mstride, dst1, dst2, stride3, mstride3
-    LOOPFILTER h, 88
+%macro LPF_16_VH_ALL_OPTS 1
+LPF_16_VH %1, sse2
+LPF_16_VH %1, ssse3
+LPF_16_VH %1, avx
 %endmacro
 
-LPF_16_16_VH sse2
-LPF_16_16_VH ssse3
-LPF_16_16_VH avx
-
-LPF_88_16_VH sse2
-LPF_88_16_VH ssse3
-LPF_88_16_VH avx
+LPF_16_VH_ALL_OPTS 16
+LPF_16_VH_ALL_OPTS 48
+LPF_16_VH_ALL_OPTS 84
+LPF_16_VH_ALL_OPTS 88
 
 %endif ; x86-64
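
For reference, the table layout the new init_lpf entries follow: the first two
indices of loop_filter_mix2 encode the filter width (0 for 4px, 1 for 8px) of
the first and second 8-pixel halves of the 16-pixel edge, matching the digits
in the kernel names ([0][1] -> 48, [1][0] -> 84, [1][1] -> 88), and the third
index selects h (0) or v (1). A minimal caller sketch, for illustration only:
filter_mix2_edge is a hypothetical helper, not part of this patch (the real
call sites live in the VP9 decoder), and the include may need adjusting to
wherever VP9DSPContext is declared in your tree.

#include <stdint.h>
#include <stddef.h>
#include "vp9dsp.h"   /* assumed: header declaring VP9DSPContext */

/* Hypothetical helper, illustration only: dispatch one 16px edge whose
 * two 8px halves may use different filter widths (4 or 8).  wd0/wd1
 * mirror the digits in the kernel names, so wd0 = 8, wd1 = 4 picks
 * ff_vp9_loop_filter_{h,v}_84_16_<opt> via the [1][0] slot filled in
 * by init_lpf above. */
static void filter_mix2_edge(const VP9DSPContext *dsp,
                             uint8_t *dst, ptrdiff_t stride,
                             int wd0, int wd1, /* 4 or 8 */
                             int dir,          /* 0 = h, 1 = v */
                             int E, int I, int H)
{
    dsp->loop_filter_mix2[wd0 == 8][wd1 == 8][dir](dst, stride, E, I, H);
}

This is also why the asm side only needs the two new constants: mask_mix84
(0xff over the first 8 bytes) and mask_mix48 (0xff over the last 8) are ANDed
into the flat8in mask, confining the wide-filter path to whichever half of the
edge runs the 8px filter.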