From 6354ff03833b5f64d930c195ae3801cc4061505f Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Wed, 16 Sep 2015 09:08:04 -0400
Subject: [PATCH] vp9: add fullpel (put) MC SIMD for 10/12bpp.

---
/*
 * Reviewer notes. This free-text area after the "---" separator is not
 * part of the commit message and is skipped when the patch is applied.
 *
 * - The fullpel prototype macro and the mc-table initialiser macro, which
 *   were local to vp9dsp_init.c as fpel_func / init_fpel, move into the new
 *   vp9dsp_init.h as decl_fpel_func / init_fpel_func and are now used by
 *   both vp9dsp_init.c and vp9dsp_init_16bpp.c.
 * - vp9dsp_init.h has no #include lines of its own; both users include
 *   libavcodec/vp9dsp.h before it. Presumably that is what supplies
 *   VP9DSPContext, uint8_t and ptrdiff_t -- confirm before including the
 *   header from anywhere else.
 * - ff_vp9dsp_init_x86() used to return immediately when bpp != 8. It now
 *   calls ff_vp9dsp_init_16bpp_x86(dsp, bpp) first and then returns.
 * - ff_vp9dsp_init_16bpp_x86() assigns only the idx2 == 0 ("put") [0][0]
 *   entries of dsp->mc. For every idx1 it uses a size suffix twice the one
 *   used by the 8bpp init (8, 16, 32, 64, 128 for idx1 4..0 instead of
 *   4, 8, 16, 32, 64) -- presumably the suffix is a row width in bytes with
 *   two bytes per sample; confirm. Its bpp argument is not read here.
 * - In vp9mc.asm, fpel_fn gains an optional 7th argument (default 4) that
 *   replaces the literal 4 on the cglobal line of the %else branch
 *   (presumably the vector-register count -- see x86inc). When
 *   %2 divided by mmsize equals 8, the loop also loads and stores m4..m7 at
 *   byte offsets of 4 to 7 times mmsize. Only the SSE put 128 instantiation
 *   passes 8; the AVX put 128 instantiation keeps the default.
 */
 libavcodec/x86/Makefile            |  3 +-
 libavcodec/x86/vp9dsp_init.c       | 74 +++++++++++++-----------------
 libavcodec/x86/vp9dsp_init.h       | 39 ++++++++++++++++
 libavcodec/x86/vp9dsp_init_16bpp.c | 65 ++++++++++++++++++++++++++
 libavcodec/x86/vp9mc.asm           | 18 +++++++-
 5 files changed, 155 insertions(+), 44 deletions(-)
 create mode 100644 libavcodec/x86/vp9dsp_init.h
 create mode 100644 libavcodec/x86/vp9dsp_init_16bpp.c

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 3c3cc1cebb..616f83063f 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -62,7 +62,8 @@ OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
 OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
 OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
 OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
-OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o \
+                                          x86/vp9dsp_init_16bpp.o
 OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
 
 
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index f24cb674fa..ebfd963d34 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -23,31 +23,26 @@
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
 
 #if HAVE_YASM
 
-#define fpel_func(avg, sz, opt) \
-void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
-                              const uint8_t *src, ptrdiff_t src_stride, \
-                              int h, int mx, int my)
-fpel_func(put,  4, mmx);
-fpel_func(put,  8, mmx);
-fpel_func(put, 16, sse);
-fpel_func(put, 32, sse);
-fpel_func(put, 64, sse);
-fpel_func(avg,  4, mmxext);
-fpel_func(avg,  8, mmxext);
-fpel_func(avg, 16, sse2);
-fpel_func(avg, 32, sse2);
-fpel_func(avg, 64, sse2);
-fpel_func(put, 32, avx);
-fpel_func(put, 64, avx);
-fpel_func(avg, 32, avx2);
-fpel_func(avg, 64, avx2);
-#undef fpel_func
+decl_fpel_func(put,  4, mmx);
+decl_fpel_func(put,  8, mmx);
+decl_fpel_func(put, 16, sse);
+decl_fpel_func(put, 32, sse);
+decl_fpel_func(put, 64, sse);
+decl_fpel_func(avg,  4, mmxext);
+decl_fpel_func(avg,  8, mmxext);
+decl_fpel_func(avg, 16, sse2);
+decl_fpel_func(avg, 32, sse2);
+decl_fpel_func(avg, 64, sse2);
+decl_fpel_func(put, 32, avx);
+decl_fpel_func(put, 64, avx);
+decl_fpel_func(avg, 32, avx2);
+decl_fpel_func(avg, 64, avx2);
 
 #define mc_func(avg, sz, dir, opt, type, f_sz) \
 void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -311,16 +306,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 {
 #if HAVE_YASM
     int cpu_flags;
-    if (bpp != 8) return;
+    if (bpp != 8) {
+        ff_vp9dsp_init_16bpp_x86(dsp, bpp);
+        return;
+    }
 
     cpu_flags = av_get_cpu_flags();
 
-#define init_fpel(idx1, idx2, sz, type, opt) \
-    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
-    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
-
 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
@@ -386,8 +378,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 } while (0)
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        init_fpel(4, 0,  4, put, mmx);
-        init_fpel(3, 0,  8, put, mmx);
+        init_fpel_func(4, 0,  4, put, mmx);
+        init_fpel_func(3, 0,  8, put, mmx);
         if (!bitexact) {
             dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
             dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
@@ -400,8 +392,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_subpel2(4, 0, 4, put, mmxext);
         init_subpel2(4, 1, 4, avg, mmxext);
-        init_fpel(4, 1,  4, avg, mmxext);
-        init_fpel(3, 1,  8, avg, mmxext);
+        init_fpel_func(4, 1,  4, avg, mmxext);
+        init_fpel_func(3, 1,  8, avg, mmxext);
         dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
         init_dc_ipred(4, mmxext);
         init_dc_ipred(8, mmxext);
@@ -409,9 +401,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     }
 
     if (EXTERNAL_SSE(cpu_flags)) {
-        init_fpel(2, 0, 16, put, sse);
-        init_fpel(1, 0, 32, put, sse);
-        init_fpel(0, 0, 64, put, sse);
+        init_fpel_func(2, 0, 16, put, sse);
+        init_fpel_func(1, 0, 32, put, sse);
+        init_fpel_func(0, 0, 64, put, sse);
         init_ipred(16, sse, v, VERT);
         init_ipred(32, sse, v, VERT);
     }
@@ -419,9 +411,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
     if (EXTERNAL_SSE2(cpu_flags)) {
         init_subpel3_8to64(0, put, sse2);
         init_subpel3_8to64(1, avg, sse2);
-        init_fpel(2, 1, 16, avg, sse2);
-        init_fpel(1, 1, 32, avg, sse2);
-        init_fpel(0, 1, 64, avg, sse2);
+        init_fpel_func(2, 1, 16, avg, sse2);
+        init_fpel_func(1, 1, 32, avg, sse2);
+        init_fpel_func(0, 1, 64, avg, sse2);
         init_lpf(sse2);
         dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
         dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
@@ -491,14 +483,14 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         init_dir_tm_h_ipred(32, avx);
     }
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
-        init_fpel(1, 0, 32, put, avx);
-        init_fpel(0, 0, 64, put, avx);
+        init_fpel_func(1, 0, 32, put, avx);
+        init_fpel_func(0, 0, 64, put, avx);
         init_ipred(32, avx, v, VERT);
     }
 
     if (EXTERNAL_AVX2(cpu_flags)) {
-        init_fpel(1, 1, 32, avg, avx2);
-        init_fpel(0, 1, 64, avg, avx2);
+        init_fpel_func(1, 1, 32, avg, avx2);
+        init_fpel_func(0, 1, 64, avg, avx2);
         if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
             init_subpel3_32_64(0, put, avx2);
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
new file mode 100644
index 0000000000..8c99c0d081
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -0,0 +1,39 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VP9DSP_INIT_H
+#define AVCODEC_X86_VP9DSP_INIT_H
+
+#define decl_fpel_func(avg, sz, opt) \
+void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+                              const uint8_t *src, ptrdiff_t src_stride, \
+                              int h, int mx, int my)
+
+#define init_fpel_func(idx1, idx2, sz, type, opt) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_##opt
+
+void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp);
+
+#endif /* AVCODEC_X86_VP9DSP_INIT_H */
diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
new file mode 100644
index 0000000000..6f2c50d04a
--- /dev/null
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -0,0 +1,65 @@
+/*
+ * VP9 SIMD optimizations
+ *
+ * Copyright (c) 2013 Ronald S. Bultje
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/vp9dsp.h"
+#include "libavcodec/x86/vp9dsp_init.h"
+
+#if HAVE_YASM
+
+decl_fpel_func(put,   8, mmx);
+decl_fpel_func(put,  16, sse);
+decl_fpel_func(put,  32, sse);
+decl_fpel_func(put,  64, sse);
+decl_fpel_func(put, 128, sse);
+decl_fpel_func(put,  32, avx);
+decl_fpel_func(put,  64, avx);
+decl_fpel_func(put, 128, avx);
+
+#endif /* HAVE_YASM */
+
+av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
+{
+#if HAVE_YASM
+    int cpu_flags = av_get_cpu_flags();
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        init_fpel_func(4, 0,   8, put, mmx);
+    }
+
+    if (EXTERNAL_SSE(cpu_flags)) {
+        init_fpel_func(3, 0,  16, put, sse);
+        init_fpel_func(2, 0,  32, put, sse);
+        init_fpel_func(1, 0,  64, put, sse);
+        init_fpel_func(0, 0, 128, put, sse);
+    }
+    if (EXTERNAL_AVX_FAST(cpu_flags)) {
+        init_fpel_func(2, 0,  32, put, avx);
+        init_fpel_func(1, 0,  64, put, avx);
+        init_fpel_func(0, 0, 128, put, avx);
+    }
+
+#endif /* HAVE_YASM */
+}
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 53939579fc..fb5b1e9d9b 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -553,7 +553,7 @@ filter_vx2_fn avg
 
 %endif ; ARCH_X86_64
 
-%macro fpel_fn 6
+%macro fpel_fn 6-7 4
 %if %2 == 4
 %define %%srcfn movh
 %define %%dstfn movh
@@ -567,13 +567,19 @@ cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
     lea  sstride3q, [sstrideq*3]
     lea  dstride3q, [dstrideq*3]
 %else
-cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
+cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
 %endif
 .loop:
     %%srcfn     m0, [srcq]
     %%srcfn     m1, [srcq+s%3]
     %%srcfn     m2, [srcq+s%4]
     %%srcfn     m3, [srcq+s%5]
+%if %2/mmsize == 8
+    %%srcfn     m4, [srcq+mmsize*4]
+    %%srcfn     m5, [srcq+mmsize*5]
+    %%srcfn     m6, [srcq+mmsize*6]
+    %%srcfn     m7, [srcq+mmsize*7]
+%endif
     lea       srcq, [srcq+sstrideq*%6]
 %ifidn %1, avg
     pavgb       m0, [dstq]
@@ -585,6 +591,12 @@ cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
     %%dstfn [dstq+d%3], m1
     %%dstfn [dstq+d%4], m2
     %%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+    %%dstfn [dstq+mmsize*4], m4
+    %%dstfn [dstq+mmsize*5], m5
+    %%dstfn [dstq+mmsize*6], m6
+    %%dstfn [dstq+mmsize*7], m7
+%endif
     lea       dstq, [dstq+dstrideq*%6]
     sub         hd, %6
     jnz .loop
@@ -605,6 +617,7 @@ INIT_XMM sse
 fpel_fn put, 16, strideq, strideq*2, stride3q, 4
 fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
 fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 8
 INIT_XMM sse2
 fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
 fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
@@ -612,6 +625,7 @@ fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
 INIT_YMM avx
 fpel_fn put, 32, strideq, strideq*2, stride3q, 4
 fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 fpel_fn avg, 32, strideq, strideq*2, stride3q, 4