diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index ebfd963d34..244f5f0f40 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -29,20 +29,20 @@
 
 #if HAVE_YASM
 
-decl_fpel_func(put, 4, mmx);
-decl_fpel_func(put, 8, mmx);
-decl_fpel_func(put, 16, sse);
-decl_fpel_func(put, 32, sse);
-decl_fpel_func(put, 64, sse);
-decl_fpel_func(avg, 4, mmxext);
-decl_fpel_func(avg, 8, mmxext);
-decl_fpel_func(avg, 16, sse2);
-decl_fpel_func(avg, 32, sse2);
-decl_fpel_func(avg, 64, sse2);
-decl_fpel_func(put, 32, avx);
-decl_fpel_func(put, 64, avx);
-decl_fpel_func(avg, 32, avx2);
-decl_fpel_func(avg, 64, avx2);
+decl_fpel_func(put, 4, , mmx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(avg, 4, _8, mmxext);
+decl_fpel_func(avg, 8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
 
 #define mc_func(avg, sz, dir, opt, type, f_sz) \
 void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -378,8 +378,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 } while (0)
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel_func(4, 0, 4, put, mmx);
-        init_fpel_func(3, 0, 8, put, mmx);
+        init_fpel_func(4, 0, 4, put, , mmx);
+        init_fpel_func(3, 0, 8, put, , mmx);
         if (!bitexact) {
             dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
             dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
@@ -392,8 +392,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_subpel2(4, 0, 4, put, mmxext);
         init_subpel2(4, 1, 4, avg, mmxext);
-        init_fpel_func(4, 1, 4, avg, mmxext);
-        init_fpel_func(3, 1, 8, avg, mmxext);
+        init_fpel_func(4, 1, 4, avg, _8, mmxext);
+        init_fpel_func(3, 1, 8, avg, _8, mmxext);
         dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
         init_dc_ipred(4, mmxext);
         init_dc_ipred(8, mmxext);
@@ -401,9 +401,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel_func(2, 0, 16, put, sse);
-        init_fpel_func(1, 0, 32, put, sse);
-        init_fpel_func(0, 0, 64, put, sse);
+        init_fpel_func(2, 0, 16, put, , sse);
+        init_fpel_func(1, 0, 32, put, , sse);
+        init_fpel_func(0, 0, 64, put, , sse);
         init_ipred(16, sse, v, VERT);
         init_ipred(32, sse, v, VERT);
     }
@@ -411,9 +411,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_SSE2(cpu_flags)) {
         init_subpel3_8to64(0, put, sse2);
         init_subpel3_8to64(1, avg, sse2);
-        init_fpel_func(2, 1, 16, avg, sse2);
-        init_fpel_func(1, 1, 32, avg, sse2);
-        init_fpel_func(0, 1, 64, avg, sse2);
+        init_fpel_func(2, 1, 16, avg, _8, sse2);
+        init_fpel_func(1, 1, 32, avg, _8, sse2);
+        init_fpel_func(0, 1, 64, avg, _8, sse2);
         init_lpf(sse2);
         dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
         dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
@@ -483,14 +483,14 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         init_dir_tm_h_ipred(32, avx);
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        init_fpel_func(1, 0, 32, put, avx);
-        init_fpel_func(0, 0, 64, put, avx);
+        init_fpel_func(1, 0, 32, put, , avx);
+        init_fpel_func(0, 0, 64, put, , avx);
         init_ipred(32, avx, v, VERT);
     }
 
     if (EXTERNAL_AVX2(cpu_flags)) {
-        init_fpel_func(1, 1, 32, avg, avx2);
-        init_fpel_func(0, 1, 64, avg, avx2);
+        init_fpel_func(1, 1, 32, avg, _8, avx2);
+        init_fpel_func(0, 1, 64, avg, _8, avx2);
         if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
             init_subpel3_32_64(0, put, avx2);
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 8c99c0d081..792405ee86 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -23,16 +23,16 @@
 #ifndef AVCODEC_X86_VP9DSP_INIT_H
 #define AVCODEC_X86_VP9DSP_INIT_H
 
-#define decl_fpel_func(avg, sz, opt) \
-void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                              const uint8_t *src, ptrdiff_t src_stride, \
-                              int h, int mx, int my)
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                                   const uint8_t *src, ptrdiff_t src_stride, \
+                                   int h, int mx, int my)
 
-#define init_fpel_func(idx1, idx2, sz, type, opt) \
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
     dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
 
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp);
 
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 6f2c50d04a..daf9cc5d6d 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -29,14 +29,22 @@
 
 #if HAVE_YASM
 
-decl_fpel_func(put, 8, mmx);
-decl_fpel_func(put, 16, sse);
-decl_fpel_func(put, 32, sse);
-decl_fpel_func(put, 64, sse);
-decl_fpel_func(put, 128, sse);
-decl_fpel_func(put, 32, avx);
-decl_fpel_func(put, 64, avx);
-decl_fpel_func(put, 128, avx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(avg, 8, _16, mmxext);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(put, 128, , sse);
+decl_fpel_func(avg, 16, _16, sse2);
+decl_fpel_func(avg, 32, _16, sse2);
+decl_fpel_func(avg, 64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(put, 128, , avx);
+decl_fpel_func(avg, 32, _16, avx2);
+decl_fpel_func(avg, 64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
 
 #endif /* HAVE_YASM */
 
@@ -46,19 +54,37 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel_func(4, 0, 8, put, mmx);
+        init_fpel_func(4, 0, 8, put, , mmx);
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_fpel_func(4, 1, 8, avg, _16, mmxext);
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel_func(3, 0, 16, put, sse);
-        init_fpel_func(2, 0, 32, put, sse);
-        init_fpel_func(1, 0, 64, put, sse);
-        init_fpel_func(0, 0, 128, put, sse);
+        init_fpel_func(3, 0, 16, put, , sse);
+        init_fpel_func(2, 0, 32, put, , sse);
+        init_fpel_func(1, 0, 64, put, , sse);
+        init_fpel_func(0, 0, 128, put, , sse);
     }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel_func(3, 1, 16, avg, _16, sse2);
+        init_fpel_func(2, 1, 32, avg, _16, sse2);
+        init_fpel_func(1, 1, 64, avg, _16, sse2);
+        init_fpel_func(0, 1, 128, avg, _16, sse2);
+    }
+
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        init_fpel_func(2, 0, 32, put, avx);
-        init_fpel_func(1, 0, 64, put, avx);
-        init_fpel_func(0, 0, 128, put, avx);
+        init_fpel_func(2, 0, 32, put, , avx);
+        init_fpel_func(1, 0, 64, put, , avx);
+        init_fpel_func(0, 0, 128, put, , avx);
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        init_fpel_func(2, 1, 32, avg, _16, avx2);
+        init_fpel_func(1, 1, 64, avg, _16, avx2);
+        init_fpel_func(0, 1, 128, avg, _16, avx2);
     }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index fb5b1e9d9b..bc61c12841 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -553,7 +553,7 @@ filter_vx2_fn avg
 
 %endif ; ARCH_X86_64
 
-%macro fpel_fn 6-7 4
+%macro fpel_fn 6-8 0, 4
 %if %2 == 4
 %define %%srcfn movh
 %define %%dstfn movh
@@ -562,12 +562,22 @@ filter_vx2_fn avg
 %define %%dstfn mova
 %endif
 
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
 %if %2 <= mmsize
-cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+cglobal vp9_%1%2%%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
     lea  sstride3q, [sstrideq*3]
     lea  dstride3q, [dstrideq*3]
 %else
-cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
+cglobal vp9_%1%2%%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
 %endif
 .loop:
     %%srcfn m0, [srcq]
@@ -582,10 +592,16 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
 %endif
     lea      srcq, [srcq+sstrideq*%6]
 %ifidn %1, avg
-    pavgb    m0, [dstq]
-    pavgb    m1, [dstq+d%3]
-    pavgb    m2, [dstq+d%4]
-    pavgb    m3, [dstq+d%5]
+    %%pavg   m0, [dstq]
+    %%pavg   m1, [dstq+d%3]
+    %%pavg   m2, [dstq+d%4]
+    %%pavg   m3, [dstq+d%5]
+%if %2/mmsize == 8
+    %%pavg   m4, [dstq+mmsize*4]
+    %%pavg   m5, [dstq+mmsize*5]
+    %%pavg   m6, [dstq+mmsize*6]
+    %%pavg   m7, [dstq+mmsize*7]
+%endif
 %endif
     %%dstfn [dstq], m0
     %%dstfn [dstq+d%3], m1
@@ -611,25 +627,38 @@ INIT_MMX mmx
 fpel_fn put, 4, strideq, strideq*2, stride3q, 4
 fpel_fn put, 8, strideq, strideq*2, stride3q, 4
 INIT_MMX mmxext
-fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8
 INIT_XMM sse
 fpel_fn put, 16, strideq, strideq*2, stride3q, 4
 fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
 fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
-fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 8
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8
 INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
-fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8
 INIT_YMM avx
 fpel_fn put, 32, strideq, strideq*2, stride3q, 4
 fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
 fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
 %endif
 %undef s16
 %undef d16
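
For reference (not part of the patch itself): the asm fpel_fn macro now takes an optional bpp argument (default 0) ahead of the register count (default 4); 8 selects pavgb and the _8 name suffix, 16 selects pavgw and _16, and 0 leaves the name unsuffixed. Below is a minimal, illustrative C sketch of how the reworked decl_fpel_func() from vp9dsp_init.h builds the matching symbol names; only the macro body is taken verbatim from the diff, the example declarations and expansion comments are mine.

/* Illustrative sketch: how the bpp token is pasted into the exported
 * symbol name. An empty bpp argument keeps the old unsuffixed names,
 * while _8/_16 select the bit-depth-suffixed avg variants. */
#include <stddef.h>
#include <stdint.h>

#define decl_fpel_func(avg, sz, bpp, opt) \
void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                   const uint8_t *src, ptrdiff_t src_stride, \
                                   int h, int mx, int my)

decl_fpel_func(put, 64,    , avx);  /* declares ff_vp9_put64_avx()     */
decl_fpel_func(avg, 64,  _8, avx2); /* declares ff_vp9_avg64_8_avx2()  */
decl_fpel_func(avg, 64, _16, avx2); /* declares ff_vp9_avg64_16_avx2() */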