diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index febaccde11..01e5f18783 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -160,6 +160,7 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \ x86/vp9itxfm.o \ x86/vp9lpf.o \ + x86/vp9lpf_16bpp.o \ x86/vp9mc.o \ x86/vp9mc_16bpp.o YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c index 553dd49d4f..9f3c8b4165 100644 --- a/libavcodec/x86/constants.c +++ b/libavcodec/x86/constants.c @@ -55,6 +55,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x040 0x0400040004000400ULL, 0x0400040004000400ULL}; DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL, 0x0800080008000800ULL, 0x0800080008000800ULL }; +DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL, + 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL, 0x1000100010001000ULL, 0x1000100010001000ULL }; DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL, diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h index 33dbb650ae..37a1869641 100644 --- a/libavcodec/x86/constants.h +++ b/libavcodec/x86/constants.h @@ -47,6 +47,7 @@ extern const ymm_reg ff_pw_512; extern const ymm_reg ff_pw_1023; extern const ymm_reg ff_pw_1024; extern const ymm_reg ff_pw_2048; +extern const ymm_reg ff_pw_4095; extern const ymm_reg ff_pw_4096; extern const ymm_reg ff_pw_8192; extern const ymm_reg ff_pw_m1; diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c index f48225cf3a..56cd79e7a4 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp_template.c +++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c @@ -65,6 +65,62 @@ filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp) filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp) #endif +#define decl_lpf_func(dir, wd, bpp, opt) \ +void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) + +#define decl_lpf_funcs(dir, wd, bpp) \ +decl_lpf_func(dir, wd, bpp, sse2); \ +decl_lpf_func(dir, wd, bpp, ssse3); \ +decl_lpf_func(dir, wd, bpp, avx) + +#define decl_lpf_funcs_wd(dir) \ +decl_lpf_funcs(dir, 4, BPC); \ +decl_lpf_funcs(dir, 8, BPC); \ +decl_lpf_funcs(dir, 16, BPC) + +decl_lpf_funcs_wd(h); +decl_lpf_funcs_wd(v); + +#define lpf_16_wrapper(dir, off, bpp, opt) \ +static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \ + ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \ +} + +#define lpf_16_wrappers(bpp, opt) \ +lpf_16_wrapper(h, 8 * stride, bpp, opt); \ +lpf_16_wrapper(v, 16, bpp, opt) + +lpf_16_wrappers(BPC, sse2); +lpf_16_wrappers(BPC, ssse3); +lpf_16_wrappers(BPC, avx); + +#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \ +static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \ + int E, int I, int H) \ +{ \ + ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \ + E & 0xff, I & 0xff, H & 0xff); \ + ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \ + E >> 8, I >> 8, H >> 8); \ +} + +#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \ +lpf_mix2_wrapper(h, 8 * 
stride, wd1, wd2, bpp, opt); \ +lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt) + +#define lpf_mix2_wrappers_set(bpp, opt) \ +lpf_mix2_wrappers(4, 4, bpp, opt); \ +lpf_mix2_wrappers(4, 8, bpp, opt); \ +lpf_mix2_wrappers(8, 4, bpp, opt); \ +lpf_mix2_wrappers(8, 8, bpp, opt); \ + +lpf_mix2_wrappers_set(BPC, sse2); +lpf_mix2_wrappers_set(BPC, ssse3); +lpf_mix2_wrappers_set(BPC, avx); #endif /* HAVE_YASM */ av_cold void INIT_FUNC(VP9DSPContext *dsp) @@ -72,9 +128,43 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp) #if HAVE_YASM int cpu_flags = av_get_cpu_flags(); +#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \ + dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt +#define init_lpf_16_func(idx, dir, bpp, opt) \ + dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt +#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \ + dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt + +#define init_lpf_funcs(bpp, opt) \ + init_lpf_8_func(0, 0, h, 4, bpp, opt); \ + init_lpf_8_func(0, 1, v, 4, bpp, opt); \ + init_lpf_8_func(1, 0, h, 8, bpp, opt); \ + init_lpf_8_func(1, 1, v, 8, bpp, opt); \ + init_lpf_8_func(2, 0, h, 16, bpp, opt); \ + init_lpf_8_func(2, 1, v, 16, bpp, opt); \ + init_lpf_16_func(0, h, bpp, opt); \ + init_lpf_16_func(1, v, bpp, opt); \ + init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \ + init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \ + init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \ + init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \ + init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \ + init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \ + init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \ + init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt) + if (EXTERNAL_SSE2(cpu_flags)) { init_subpel3(0, put, BPC, sse2); init_subpel3(1, avg, BPC, sse2); + init_lpf_funcs(BPC, sse2); + } + + if (EXTERNAL_SSSE3(cpu_flags)) { + init_lpf_funcs(BPC, ssse3); + } + + if (EXTERNAL_AVX(cpu_flags)) { + init_lpf_funcs(BPC, avx); } if (EXTERNAL_AVX2(cpu_flags)) { diff --git a/libavcodec/x86/vp9lpf_16bpp.asm b/libavcodec/x86/vp9lpf_16bpp.asm new file mode 100644 index 0000000000..c15437b8ba --- /dev/null +++ b/libavcodec/x86/vp9lpf_16bpp.asm @@ -0,0 +1,823 @@ +;****************************************************************************** +;* VP9 loop filter SIMD optimizations +;* +;* Copyright (C) 2015 Ronald S. Bultje +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_511: times 16 dw 511 +pw_2047: times 16 dw 2047 +pw_16384: times 16 dw 16384 +pw_m512: times 16 dw -512 +pw_m2048: times 16 dw -2048 + +cextern pw_1 +cextern pw_3 +cextern pw_4 +cextern pw_8 +cextern pw_16 +cextern pw_256 +cextern pw_1023 +cextern pw_4095 +cextern pw_m1 + +SECTION .text + +%macro SCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%if %0 == 4 +%define reg_%4 m%2 +%endif +%else + mova [%3], m%1 +%if %0 == 4 +%define reg_%4 [%3] +%endif +%endif +%endmacro + +%macro UNSCRATCH 3-4 +%if ARCH_X86_64 + SWAP %1, %2 +%else + mova m%1, [%3] +%endif +%if %0 == 4 +%undef reg_%4 +%endif +%endmacro + +%macro PRELOAD 2-3 +%if ARCH_X86_64 + mova m%1, [%2] +%if %0 == 3 +%define reg_%3 m%1 +%endif +%elif %0 == 3 +%define reg_%3 [%2] +%endif +%endmacro + +; calulate p or q portion of flat8out +%macro FLAT8OUT_HALF 0 + psubw m4, m0 ; q4-q0 + psubw m5, m0 ; q5-q0 + psubw m6, m0 ; q6-q0 + psubw m7, m0 ; q7-q0 + ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0) + ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0) + pcmpgtw m4, reg_F ; abs(q4-q0) > F + pcmpgtw m5, reg_F ; abs(q5-q0) > F + pcmpgtw m6, reg_F ; abs(q6-q0) > F + pcmpgtw m7, reg_F ; abs(q7-q0) > F + por m5, m4 + por m7, m6 + por m7, m5 ; !flat8out, q portion +%endmacro + +; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition) +%macro FLAT8IN_HALF 1 +%if %1 > 4 + psubw m4, m3, m0 ; q3-q0 + psubw m5, m2, m0 ; q2-q0 + ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0) + pcmpgtw m4, reg_F ; abs(q3-q0) > F + pcmpgtw m5, reg_F ; abs(q2-q0) > F +%endif + psubw m3, m2 ; q3-q2 + psubw m2, m1 ; q2-q1 + ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1) + pcmpgtw m3, reg_I ; abs(q3-q2) > I + pcmpgtw m2, reg_I ; abs(q2-q1) > I +%if %1 > 4 + por m4, m5 +%endif + por m2, m3 + psubw m3, m1, m0 ; q1-q0 + ABS1 m3, m5 ; abs(q1-q0) +%if %1 > 4 + pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F +%endif + pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H + pcmpgtw m3, reg_I ; abs(q1-q0) > I +%if %1 > 4 + por m4, m6 +%endif + por m2, m3 +%endmacro + +; one step in filter_14/filter_6 +; +; take sum $reg, downshift, apply mask and write into dst +; +; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next +; step's sum $reg. This is omitted for the last row in each filter. 
+; +; if dont_store is set, don't write the result into memory, instead keep the +; values in register so we can write it out later +%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \ + ; src/sub1, sub2, add1, add2, dont_store + psrlw %1, %2, %4 + psubw %1, %6 ; abs->delta +%ifnidn %7, "" + psubw %2, %6 + psubw %2, %7 + paddw %2, %8 + paddw %2, %9 +%endif + pand %1, reg_%3 ; apply mask +%if %10 == 1 + paddw %6, %1 ; delta->abs +%else + paddw %1, %6 ; delta->abs + mova [%5], %1 +%endif +%endmacro + +; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8} + +%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12] + +%if ARCH_X86_64 +%if %2 == 16 +%assign %%num_xmm_regs 16 +%elif %2 == 8 +%assign %%num_xmm_regs 15 +%else ; %2 == 4 +%assign %%num_xmm_regs 14 +%endif ; %2 +%assign %%bak_mem 0 +%else ; ARCH_X86_32 +%assign %%num_xmm_regs 8 +%if %2 == 16 +%assign %%bak_mem 7 +%elif %2 == 8 +%assign %%bak_mem 6 +%else ; %2 == 4 +%assign %%bak_mem 5 +%endif ; %2 +%endif ; ARCH_X86_64/32 + +%if %2 == 16 +%ifidn %1, v +%assign %%num_gpr_regs 6 +%else ; %1 == h +%assign %%num_gpr_regs 5 +%endif ; %1 +%assign %%wd_mem 6 +%else ; %2 == 8/4 +%assign %%num_gpr_regs 5 +%if ARCH_X86_32 && %2 == 8 +%assign %%wd_mem 2 +%else ; ARCH_X86_64 || %2 == 4 +%assign %%wd_mem 0 +%endif ; ARCH_X86_64/32 etc. +%endif ; %2 + +%ifidn %1, v +%assign %%tsp_mem 0 +%elif %2 == 16 ; && %1 == h +%assign %%tsp_mem 16 +%else ; %1 == h && %1 == 8/4 +%assign %%tsp_mem 8 +%endif ; %1/%2 + +%assign %%off %%wd_mem +%assign %%tspoff %%bak_mem+%%wd_mem +%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize) + +%if %3 == 10 +%define %%maxsgn 511 +%define %%minsgn m512 +%define %%maxusgn 1023 +%define %%maxf 4 +%else ; %3 == 12 +%define %%maxsgn 2047 +%define %%minsgn m2048 +%define %%maxusgn 4095 +%define %%maxf 16 +%endif ; %3 + +cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H + ; prepare E, I and H masks + shl Ed, %3-8 + shl Id, %3-8 + shl Hd, %3-8 +%if cpuflag(ssse3) + mova m0, [pw_256] +%endif + movd m1, Ed + movd m2, Id + movd m3, Hd +%if cpuflag(ssse3) + pshufb m1, m0 ; E << (bit_depth - 8) + pshufb m2, m0 ; I << (bit_depth - 8) + pshufb m3, m0 ; H << (bit_depth - 8) +%else + punpcklwd m1, m1 + punpcklwd m2, m2 + punpcklwd m3, m3 + pshufd m1, m1, q0000 + pshufd m2, m2, q0000 + pshufd m3, m3, q0000 +%endif + SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E + SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I + SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H +%if %2 > 4 + PRELOAD 11, pw_ %+ %%maxf, F +%endif + + ; set up variables to load data +%ifidn %1, v + DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12 + lea stride3q, [strideq*3] + neg strideq +%if %2 == 16 + lea dst0q, [dst8q+strideq*8] +%else + lea dst4q, [dst8q+strideq*4] +%endif + neg strideq +%if %2 == 16 + lea dst12q, [dst8q+strideq*4] + lea dst4q, [dst0q+strideq*4] +%endif + +%if %2 == 16 +%define %%p7 dst0q +%define %%p6 dst0q+strideq +%define %%p5 dst0q+strideq*2 +%define %%p4 dst0q+stride3q +%endif +%define %%p3 dst4q +%define %%p2 dst4q+strideq +%define %%p1 dst4q+strideq*2 +%define %%p0 dst4q+stride3q +%define %%q0 dst8q +%define %%q1 dst8q+strideq +%define %%q2 dst8q+strideq*2 +%define %%q3 dst8q+stride3q +%if %2 == 16 +%define %%q4 dst12q +%define %%q5 dst12q+strideq +%define %%q6 dst12q+strideq*2 +%define %%q7 dst12q+stride3q +%endif +%else ; %1 == h + DEFINE_ARGS dst0, stride, stride3, dst4 + lea stride3q, [strideq*3] + lea dst4q, [dst0q+strideq*4] + +%define %%p3 rsp+(%%tspoff+0)*mmsize +%define %%p2 
rsp+(%%tspoff+1)*mmsize +%define %%p1 rsp+(%%tspoff+2)*mmsize +%define %%p0 rsp+(%%tspoff+3)*mmsize +%define %%q0 rsp+(%%tspoff+4)*mmsize +%define %%q1 rsp+(%%tspoff+5)*mmsize +%define %%q2 rsp+(%%tspoff+6)*mmsize +%define %%q3 rsp+(%%tspoff+7)*mmsize + +%if %2 < 16 + movu m0, [dst0q+strideq*0-8] + movu m1, [dst0q+strideq*1-8] + movu m2, [dst0q+strideq*2-8] + movu m3, [dst0q+stride3q -8] + movu m4, [dst4q+strideq*0-8] + movu m5, [dst4q+strideq*1-8] + movu m6, [dst4q+strideq*2-8] + movu m7, [dst4q+stride3q -8] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0] +%endif + + mova [%%p3], m0 + mova [%%p2], m1 + mova [%%p1], m2 + mova [%%p0], m3 +%if ARCH_X86_64 + mova [%%q0], m4 +%endif + mova [%%q1], m5 + mova [%%q2], m6 + mova [%%q3], m7 + + ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register + ; order here accordingly +%else ; %2 == 16 + +%define %%p7 rsp+(%%tspoff+ 8)*mmsize +%define %%p6 rsp+(%%tspoff+ 9)*mmsize +%define %%p5 rsp+(%%tspoff+10)*mmsize +%define %%p4 rsp+(%%tspoff+11)*mmsize +%define %%q4 rsp+(%%tspoff+12)*mmsize +%define %%q5 rsp+(%%tspoff+13)*mmsize +%define %%q6 rsp+(%%tspoff+14)*mmsize +%define %%q7 rsp+(%%tspoff+15)*mmsize + + mova m0, [dst0q+strideq*0-16] + mova m1, [dst0q+strideq*1-16] + mova m2, [dst0q+strideq*2-16] + mova m3, [dst0q+stride3q -16] + mova m4, [dst4q+strideq*0-16] + mova m5, [dst4q+strideq*1-16] +%if ARCH_X86_64 + mova m6, [dst4q+strideq*2-16] +%endif + mova m7, [dst4q+stride3q -16] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1 +%endif + + mova [%%p7], m0 + mova [%%p6], m1 + mova [%%p5], m2 + mova [%%p4], m3 +%if ARCH_X86_64 + mova [%%p3], m4 +%endif + mova [%%p2], m5 + mova [%%p1], m6 + mova [%%p0], m7 + + mova m0, [dst0q+strideq*0] + mova m1, [dst0q+strideq*1] + mova m2, [dst0q+strideq*2] + mova m3, [dst0q+stride3q ] + mova m4, [dst4q+strideq*0] + mova m5, [dst4q+strideq*1] +%if ARCH_X86_64 + mova m6, [dst4q+strideq*2] +%endif + mova m7, [dst4q+stride3q ] + +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1 +%endif + + mova [%%q0], m0 + mova [%%q1], m1 + mova [%%q2], m2 + mova [%%q3], m3 +%if ARCH_X86_64 + mova [%%q4], m4 +%endif + mova [%%q5], m5 + mova [%%q6], m6 + mova [%%q7], m7 + + ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register + ; order here accordingly +%endif ; %2 +%endif ; %1 + + ; load q0|q4-7 data + mova m0, [%%q0] +%if %2 == 16 + mova m4, [%%q4] + mova m5, [%%q5] + mova m6, [%%q6] + mova m7, [%%q7] + + ; flat8out q portion + FLAT8OUT_HALF + SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O +%endif + + ; load q1-3 data + mova m1, [%%q1] + mova m2, [%%q2] + mova m3, [%%q3] + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flatout[q] + ; m12-14=free + ; m0-3=q0-q3 + ; m4-7=free + + ; flat8in|fm|hev q portion + FLAT8IN_HALF %2 + SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV +%if %2 > 4 + SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I +%endif + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; m2=!fm[q] + ; m0,1=q0-q1 + ; m2-7=free + ; m12=free + + ; load p0-1 + mova m3, [%%p0] + mova m4, [%%p1] + + ; fm mb_edge portion + psubw m5, m3, m0 ; q0-p0 + psubw m6, m4, m1 ; q1-p1 +%if ARCH_X86_64 + ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1) +%else + ABS1 m5, m7 ; abs(q0-p0) + ABS1 m6, m7 ; abs(q1-p1) 
+%endif + paddw m5, m5 + psraw m6, 1 + paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1) + pcmpgtw m6, reg_E + por m2, m6 + SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out[q] + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; r12[m12]=!fm[q] + ; m3-4=q0-1 + ; m0-2/5-7=free + + ; load p4-7 data + SWAP 3, 0 ; p0 + SWAP 4, 1 ; p1 +%if %2 == 16 + mova m7, [%%p7] + mova m6, [%%p6] + mova m5, [%%p5] + mova m4, [%%p4] + + ; flat8out p portion + FLAT8OUT_HALF + por m7, reg_F8O + SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O +%endif + + ; r6-8|pw_4[m8-11]=reg_E/I/H/F + ; r9[m15]=!flat8out + ; r10[m13]=hev[q] + ; r11[m14]=!flat8in[q] + ; r12[m12]=!fm[q] + ; m0=p0 + ; m1-7=free + + ; load p2-3 data + mova m2, [%%p2] + mova m3, [%%p3] + + ; flat8in|fm|hev p portion + FLAT8IN_HALF %2 + por m7, reg_HEV +%if %2 > 4 + por m4, reg_F8I +%endif + por m2, reg_FM +%if %2 > 4 + por m4, m2 ; !flat8|!fm +%if %2 == 16 + por m5, m4, reg_F8O ; !flat16|!fm + pandn m2, m4 ; filter4_mask + pandn m4, m5 ; filter8_mask + pxor m5, [pw_m1] ; filter16_mask + SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M +%else + pandn m2, m4 ; filter4_mask + pxor m4, [pw_m1] ; filter8_mask +%endif + SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M +%else + pxor m2, [pw_m1] ; filter4_mask +%endif + SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV + SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M + + ; r9[m15]=filter16_mask + ; r10[m13]=hev + ; r11[m14]=filter8_mask + ; r12[m12]=filter4_mask + ; m0,1=p0-p1 + ; m2-7=free + ; m8-11=free + +%if %2 > 4 +%if %2 == 16 + ; filter_14 + mova m2, [%%p7] + mova m3, [%%p6] + mova m6, [%%p5] + mova m7, [%%p4] + PRELOAD 8, %%p3, P3 + PRELOAD 9, %%p2, P2 +%endif + PRELOAD 10, %%q0, Q0 + PRELOAD 11, %%q1, Q1 +%if %2 == 16 + psllw m4, m2, 3 + paddw m5, m3, m3 + paddw m4, m6 + paddw m5, m7 + paddw m4, reg_P3 + paddw m5, reg_P2 + paddw m4, m1 + paddw m5, m0 + paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8 + psubw m5, m2 ; p0+p2+p4+p6*2-p7 + paddw m4, [pw_8] + paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8 + + ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction + ; at the end of the filter + + mova [rsp+0*mmsize], m3 + FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1 +%endif + mova m3, [%%q2] +%if %2 == 16 + mova [rsp+1*mmsize], m6 + FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3 +%endif + mova m6, [%%q3] +%if %2 == 16 + mova [rsp+2*mmsize], m7 + FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6 + mova m7, [%%q4] +%if ARCH_X86_64 + mova [rsp+3*mmsize], reg_P3 +%else + mova m4, reg_P3 + mova [rsp+3*mmsize], m4 +%endif + FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7 + PRELOAD 8, %%q5, Q5 +%if ARCH_X86_64 + mova [rsp+4*mmsize], reg_P2 +%else + mova m4, reg_P2 + mova [rsp+4*mmsize], m4 +%endif + FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5 + PRELOAD 9, %%q6, Q6 + mova [rsp+5*mmsize], m1 + FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6 + mova m1, [%%q7] + FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1 + FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64 + FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64 + FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1 + FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1 + FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1 + FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1 + FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6 + + mova m7, [%%p1] +%else + SWAP 1, 7 +%endif + + mova m2, [%%p3] 
+ mova m1, [%%p2] + + ; reg_Q0-1 (m10-m11) + ; m0=p0 + ; m1=p2 + ; m2=p3 + ; m3=q2 + ; m4-5=free + ; m6=q3 + ; m7=p1 + ; m8-9 unused + + ; filter_6 + psllw m4, m2, 2 + paddw m5, m1, m1 + paddw m4, m7 + psubw m5, m2 + paddw m4, m0 + paddw m5, reg_Q0 + paddw m4, [pw_4] + paddw m5, m4 + +%if ARCH_X86_64 + mova m8, m1 + mova m9, m7 +%else + mova [rsp+0*mmsize], m1 + mova [rsp+1*mmsize], m7 +%endif +%ifidn %1, v + FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1 +%else + FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1 +%endif + FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1 + FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1 +%if ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64 +%else + FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64 + FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64 +%endif + FILTER_STEP m4, m5, F8M, 3, %%q2, m3 + + UNSCRATCH 2, 10, %%q0 + UNSCRATCH 6, 11, %%q1 +%else + SWAP 1, 7 + mova m2, [%%q0] + mova m6, [%%q1] +%endif + UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV + + ; m0=p0 + ; m1=p2 + ; m2=q0 + ; m3=hev_mask + ; m4-5=free + ; m6=q1 + ; m7=p1 + + ; filter_4 + psubw m4, m7, m6 ; p1-q1 + psubw m5, m2, m0 ; q0-p0 + pand m4, m3 + pminsw m4, [pw_ %+ %%maxsgn] + pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f + paddw m4, m5 + paddw m5, m5 + paddw m4, m5 ; 3*(q0-p0)+f + pminsw m4, [pw_ %+ %%maxsgn] + pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f + pand m4, reg_F4M + paddw m5, m4, [pw_4] + paddw m4, [pw_3] + pminsw m5, [pw_ %+ %%maxsgn] + pminsw m4, [pw_ %+ %%maxsgn] + psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1 + psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2 + psubw m2, m5 ; q0-f1 + paddw m0, m4 ; p0+f2 + pandn m3, m5 ; f1 & !hev (for p1/q1 adj) + pxor m4, m4 + mova m5, [pw_ %+ %%maxusgn] + pmaxsw m2, m4 + pmaxsw m0, m4 + pminsw m2, m5 + pminsw m0, m5 +%if cpuflag(ssse3) + pmulhrsw m3, [pw_16384] ; (f1+1)>>1 +%else + paddw m3, [pw_1] + psraw m3, 1 +%endif + paddw m7, m3 ; p1+f + psubw m6, m3 ; q1-f + pmaxsw m7, m4 + pmaxsw m6, m4 + pminsw m7, m5 + pminsw m6, m5 + + ; store +%ifidn %1, v + mova [%%p1], m7 + mova [%%p0], m0 + mova [%%q0], m2 + mova [%%q1], m6 +%else ; %1 == h +%if %2 == 4 + TRANSPOSE4x4W 7, 0, 2, 6, 1 + movh [dst0q+strideq*0-4], m7 + movhps [dst0q+strideq*1-4], m7 + movh [dst0q+strideq*2-4], m0 + movhps [dst0q+stride3q -4], m0 + movh [dst4q+strideq*0-4], m2 + movhps [dst4q+strideq*1-4], m2 + movh [dst4q+strideq*2-4], m6 + movhps [dst4q+stride3q -4], m6 +%elif %2 == 8 + mova m3, [%%p3] + mova m4, [%%q2] + mova m5, [%%q3] + +%if ARCH_X86_64 + TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8 +%else + TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1 + mova m2, [%%q0] +%endif + + movu [dst0q+strideq*0-8], m3 + movu [dst0q+strideq*1-8], m1 + movu [dst0q+strideq*2-8], m7 + movu [dst0q+stride3q -8], m0 + movu [dst4q+strideq*0-8], m2 + movu [dst4q+strideq*1-8], m6 + movu [dst4q+strideq*2-8], m4 + movu [dst4q+stride3q -8], m5 +%else ; %2 == 16 + SCRATCH 2, 8, %%q0 + SCRATCH 6, 9, %%q1 + mova m2, [%%p7] + mova m3, [%%p6] + mova m4, [%%p5] + mova m5, [%%p4] + mova m6, [%%p3] + +%if ARCH_X86_64 + TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10 +%else + mova [%%p1], m7 + TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1 +%endif + + mova [dst0q+strideq*0-16], m2 + mova [dst0q+strideq*1-16], m3 + mova [dst0q+strideq*2-16], m4 + mova [dst0q+stride3q -16], m5 +%if 
ARCH_X86_64 + mova [dst4q+strideq*0-16], m6 +%endif + mova [dst4q+strideq*1-16], m1 + mova [dst4q+strideq*2-16], m7 + mova [dst4q+stride3q -16], m0 + + UNSCRATCH 2, 8, %%q0 + UNSCRATCH 6, 9, %%q1 + mova m0, [%%q2] + mova m1, [%%q3] + mova m3, [%%q4] + mova m4, [%%q5] +%if ARCH_X86_64 + mova m5, [%%q6] +%endif + mova m7, [%%q7] + +%if ARCH_X86_64 + TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8 +%else + TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1 +%endif + + mova [dst0q+strideq*0], m2 + mova [dst0q+strideq*1], m6 + mova [dst0q+strideq*2], m0 + mova [dst0q+stride3q ], m1 +%if ARCH_X86_64 + mova [dst4q+strideq*0], m3 +%endif + mova [dst4q+strideq*1], m4 + mova [dst4q+strideq*2], m5 + mova [dst4q+stride3q ], m7 +%endif ; %2 +%endif ; %1 + RET +%endmacro + +%macro LOOP_FILTER_CPUSETS 3 +INIT_XMM sse2 +LOOP_FILTER %1, %2, %3 +INIT_XMM ssse3 +LOOP_FILTER %1, %2, %3 +INIT_XMM avx +LOOP_FILTER %1, %2, %3 +%endmacro + +%macro LOOP_FILTER_WDSETS 2 +LOOP_FILTER_CPUSETS %1, 4, %2 +LOOP_FILTER_CPUSETS %1, 8, %2 +LOOP_FILTER_CPUSETS %1, 16, %2 +%endmacro + +LOOP_FILTER_WDSETS h, 10 +LOOP_FILTER_WDSETS v, 10 +LOOP_FILTER_WDSETS h, 12 +LOOP_FILTER_WDSETS v, 12 diff --git a/libavcodec/x86/vp9mc_16bpp.asm b/libavcodec/x86/vp9mc_16bpp.asm index f60dab7a63..9a462eaf80 100644 --- a/libavcodec/x86/vp9mc_16bpp.asm +++ b/libavcodec/x86/vp9mc_16bpp.asm @@ -24,10 +24,10 @@ SECTION_RODATA 32 -pw_4095: times 16 dw 0xfff pd_64: times 8 dd 64 cextern pw_1023 +cextern pw_4095 SECTION .text
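Note (illustrative, not part of the patch): per pixel, the narrow "filter_4" path that the SIMD above vectorizes reduces to the scalar logic sketched below in C. The free-standing helper name filter4_px and its signature are hypothetical; the clip helpers are the usual libavutil ones. E, I and H reach the asm pre-shifted by bit_depth - 8 (the "shl Ed, %3-8" at function entry), and the mix2 wrappers in vp9dsp_init_16bpp_template.c pack the levels of two adjacent 8-pixel edges into one int, low byte first ("E & 0xff" for the first edge, "E >> 8" for the second).

#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_intp2(), av_clip_uintp2(), FFMIN */

/* One pixel of the filter_4 step, assuming fm/hev were already derived from
 * the E/I/H thresholds as in the asm; bpp is 10 or 12. */
static void filter4_px(uint16_t *p1, uint16_t *p0, uint16_t *q0, uint16_t *q1,
                       int fm, int hev, int bpp)
{
    int f, f1, f2;

    if (!fm)                                          /* filter4_mask off: leave pixels untouched */
        return;
    f  = hev ? av_clip_intp2(*p1 - *q1, bpp - 1) : 0; /* pw_511/pw_m512 resp. pw_2047/pw_m2048 */
    f  = av_clip_intp2(3 * (*q0 - *p0) + f, bpp - 1);
    f1 = FFMIN(f + 4, (1 << (bpp - 1)) - 1) >> 3;
    f2 = FFMIN(f + 3, (1 << (bpp - 1)) - 1) >> 3;
    *q0 = av_clip_uintp2(*q0 - f1, bpp);              /* pw_1023/pw_4095 clamp */
    *p0 = av_clip_uintp2(*p0 + f2, bpp);
    if (!hev) {                                       /* outer taps only without high edge variance */
        int adj = (f1 + 1) >> 1;                      /* pmulhrsw by pw_16384 in the ssse3/avx path */
        *p1 = av_clip_uintp2(*p1 + adj, bpp);
        *q1 = av_clip_uintp2(*q1 - adj, bpp);
    }
}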