vp9: add fullpel (avg) MC SIMD for 10/12bpp.

Ronald S. Bultje 2015-09-16 09:12:27 -04:00
parent 6354ff0383
commit 77f359670f
4 changed files with 120 additions and 65 deletions
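Background note (not part of the original commit message): a fullpel MC function copies a block at integer-pel alignment with no subpel filtering, and the avg variant additionally averages the copied block with the prediction already in dst, rounding up. For 10/12bpp content the samples are 16-bit, which is why the SIMD below swaps pavgb (packed byte average) for pavgw (packed word average); both compute (a + b + 1) >> 1 per element. A scalar sketch of the 16-bit avg path follows, with hypothetical names, strides counted in samples rather than bytes, and the width passed explicitly, whereas the real functions fix it in their name:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the SIMD "avg" fullpel copy added in this commit:
 * each output sample becomes the rounded average of the existing dst
 * sample and the fetched src sample, matching what pavgw computes. */
static void avg_fullpel_16bpp(uint16_t *dst, ptrdiff_t dst_stride,
                              const uint16_t *src, ptrdiff_t src_stride,
                              int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = (uint16_t)((dst[x] + src[x] + 1) >> 1);
        dst += dst_stride;
        src += src_stride;
    }
}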

libavcodec/x86/vp9dsp_init.c

@@ -29,20 +29,20 @@
 #if HAVE_YASM
-decl_fpel_func(put, 4, mmx);
-decl_fpel_func(put, 8, mmx);
-decl_fpel_func(put, 16, sse);
-decl_fpel_func(put, 32, sse);
-decl_fpel_func(put, 64, sse);
-decl_fpel_func(avg, 4, mmxext);
-decl_fpel_func(avg, 8, mmxext);
-decl_fpel_func(avg, 16, sse2);
-decl_fpel_func(avg, 32, sse2);
-decl_fpel_func(avg, 64, sse2);
-decl_fpel_func(put, 32, avx);
-decl_fpel_func(put, 64, avx);
-decl_fpel_func(avg, 32, avx2);
-decl_fpel_func(avg, 64, avx2);
+decl_fpel_func(put, 4, , mmx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(avg, 4, _8, mmxext);
+decl_fpel_func(avg, 8, _8, mmxext);
+decl_fpel_func(avg, 16, _8, sse2);
+decl_fpel_func(avg, 32, _8, sse2);
+decl_fpel_func(avg, 64, _8, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(avg, 32, _8, avx2);
+decl_fpel_func(avg, 64, _8, avx2);
 #define mc_func(avg, sz, dir, opt, type, f_sz) \
 void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
@@ -378,8 +378,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 } while (0)
 if (EXTERNAL_MMX(cpu_flags)) {
-init_fpel_func(4, 0, 4, put, mmx);
-init_fpel_func(3, 0, 8, put, mmx);
+init_fpel_func(4, 0, 4, put, , mmx);
+init_fpel_func(3, 0, 8, put, , mmx);
 if (!bitexact) {
 dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
 dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
@@ -392,8 +392,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 if (EXTERNAL_MMXEXT(cpu_flags)) {
 init_subpel2(4, 0, 4, put, mmxext);
 init_subpel2(4, 1, 4, avg, mmxext);
-init_fpel_func(4, 1, 4, avg, mmxext);
-init_fpel_func(3, 1, 8, avg, mmxext);
+init_fpel_func(4, 1, 4, avg, _8, mmxext);
+init_fpel_func(3, 1, 8, avg, _8, mmxext);
 dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
 init_dc_ipred(4, mmxext);
 init_dc_ipred(8, mmxext);
@@ -401,9 +401,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 }
 if (EXTERNAL_SSE(cpu_flags)) {
-init_fpel_func(2, 0, 16, put, sse);
-init_fpel_func(1, 0, 32, put, sse);
-init_fpel_func(0, 0, 64, put, sse);
+init_fpel_func(2, 0, 16, put, , sse);
+init_fpel_func(1, 0, 32, put, , sse);
+init_fpel_func(0, 0, 64, put, , sse);
 init_ipred(16, sse, v, VERT);
 init_ipred(32, sse, v, VERT);
 }
@@ -411,9 +411,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 if (EXTERNAL_SSE2(cpu_flags)) {
 init_subpel3_8to64(0, put, sse2);
 init_subpel3_8to64(1, avg, sse2);
-init_fpel_func(2, 1, 16, avg, sse2);
-init_fpel_func(1, 1, 32, avg, sse2);
-init_fpel_func(0, 1, 64, avg, sse2);
+init_fpel_func(2, 1, 16, avg, _8, sse2);
+init_fpel_func(1, 1, 32, avg, _8, sse2);
+init_fpel_func(0, 1, 64, avg, _8, sse2);
 init_lpf(sse2);
 dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2;
 dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2;
@@ -483,14 +483,14 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
 init_dir_tm_h_ipred(32, avx);
 }
 if (EXTERNAL_AVX_FAST(cpu_flags)) {
-init_fpel_func(1, 0, 32, put, avx);
-init_fpel_func(0, 0, 64, put, avx);
+init_fpel_func(1, 0, 32, put, , avx);
+init_fpel_func(0, 0, 64, put, , avx);
 init_ipred(32, avx, v, VERT);
 }
 if (EXTERNAL_AVX2(cpu_flags)) {
-init_fpel_func(1, 1, 32, avg, avx2);
-init_fpel_func(0, 1, 64, avg, avx2);
+init_fpel_func(1, 1, 32, avg, _8, avx2);
+init_fpel_func(0, 1, 64, avg, _8, avx2);
 if (ARCH_X86_64) {
 #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
 init_subpel3_32_64(0, put, avx2);

libavcodec/x86/vp9dsp_init.h

@@ -23,16 +23,16 @@
 #ifndef AVCODEC_X86_VP9DSP_INIT_H
 #define AVCODEC_X86_VP9DSP_INIT_H
-#define decl_fpel_func(avg, sz, opt) \
-void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
+#define decl_fpel_func(avg, sz, bpp, opt) \
+void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
 const uint8_t *src, ptrdiff_t src_stride, \
 int h, int mx, int my)
-#define init_fpel_func(idx1, idx2, sz, type, opt) \
+#define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \
 dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
 dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
 dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
-dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_##opt
+dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt
 void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp);
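For reference, the new bpp macro argument is pasted straight into the symbol name, so passing an empty argument keeps the existing 8bpp names while _8/_16 select the per-depth variants. For example, decl_fpel_func(avg, 64, _16, sse2) expands via the macro above to:

void ff_vp9_avg64_16_sse2(uint8_t *dst, ptrdiff_t dst_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          int h, int mx, int my);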

libavcodec/x86/vp9dsp_init_16bpp.c

@@ -29,14 +29,22 @@
 #if HAVE_YASM
-decl_fpel_func(put, 8, mmx);
-decl_fpel_func(put, 16, sse);
-decl_fpel_func(put, 32, sse);
-decl_fpel_func(put, 64, sse);
-decl_fpel_func(put, 128, sse);
-decl_fpel_func(put, 32, avx);
-decl_fpel_func(put, 64, avx);
-decl_fpel_func(put, 128, avx);
+decl_fpel_func(put, 8, , mmx);
+decl_fpel_func(avg, 8, _16, mmxext);
+decl_fpel_func(put, 16, , sse);
+decl_fpel_func(put, 32, , sse);
+decl_fpel_func(put, 64, , sse);
+decl_fpel_func(put, 128, , sse);
+decl_fpel_func(avg, 16, _16, sse2);
+decl_fpel_func(avg, 32, _16, sse2);
+decl_fpel_func(avg, 64, _16, sse2);
+decl_fpel_func(avg, 128, _16, sse2);
+decl_fpel_func(put, 32, , avx);
+decl_fpel_func(put, 64, , avx);
+decl_fpel_func(put, 128, , avx);
+decl_fpel_func(avg, 32, _16, avx2);
+decl_fpel_func(avg, 64, _16, avx2);
+decl_fpel_func(avg, 128, _16, avx2);
 #endif /* HAVE_YASM */
@@ -46,19 +54,37 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
 int cpu_flags = av_get_cpu_flags();
 if (EXTERNAL_MMX(cpu_flags)) {
-init_fpel_func(4, 0, 8, put, mmx);
+init_fpel_func(4, 0, 8, put, , mmx);
+}
+if (EXTERNAL_MMXEXT(cpu_flags)) {
+init_fpel_func(4, 1, 8, avg, _16, mmxext);
 }
 if (EXTERNAL_SSE(cpu_flags)) {
-init_fpel_func(3, 0, 16, put, sse);
-init_fpel_func(2, 0, 32, put, sse);
-init_fpel_func(1, 0, 64, put, sse);
-init_fpel_func(0, 0, 128, put, sse);
+init_fpel_func(3, 0, 16, put, , sse);
+init_fpel_func(2, 0, 32, put, , sse);
+init_fpel_func(1, 0, 64, put, , sse);
+init_fpel_func(0, 0, 128, put, , sse);
 }
+if (EXTERNAL_SSE2(cpu_flags)) {
+init_fpel_func(3, 1, 16, avg, _16, sse2);
+init_fpel_func(2, 1, 32, avg, _16, sse2);
+init_fpel_func(1, 1, 64, avg, _16, sse2);
+init_fpel_func(0, 1, 128, avg, _16, sse2);
+}
 if (EXTERNAL_AVX_FAST(cpu_flags)) {
-init_fpel_func(2, 0, 32, put, avx);
-init_fpel_func(1, 0, 64, put, avx);
-init_fpel_func(0, 0, 128, put, avx);
+init_fpel_func(2, 0, 32, put, , avx);
+init_fpel_func(1, 0, 64, put, , avx);
+init_fpel_func(0, 0, 128, put, , avx);
+}
+if (EXTERNAL_AVX2(cpu_flags)) {
+init_fpel_func(2, 1, 32, avg, _16, avx2);
+init_fpel_func(1, 1, 64, avg, _16, avx2);
+init_fpel_func(0, 1, 128, avg, _16, avx2);
 }
 #endif /* HAVE_YASM */

libavcodec/x86/vp9mc.asm

@@ -553,7 +553,7 @@ filter_vx2_fn avg
 %endif ; ARCH_X86_64
-%macro fpel_fn 6-7 4
+%macro fpel_fn 6-8 0, 4
 %if %2 == 4
 %define %%srcfn movh
 %define %%dstfn movh
@@ -562,12 +562,22 @@ filter_vx2_fn avg
 %define %%dstfn mova
 %endif
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
 %if %2 <= mmsize
-cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
+cglobal vp9_%1%2%%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
 lea sstride3q, [sstrideq*3]
 lea dstride3q, [dstrideq*3]
 %else
-cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
+cglobal vp9_%1%2%%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
 %endif
 .loop:
 %%srcfn m0, [srcq]
@@ -582,10 +592,16 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
 %endif
 lea srcq, [srcq+sstrideq*%6]
 %ifidn %1, avg
-pavgb m0, [dstq]
-pavgb m1, [dstq+d%3]
-pavgb m2, [dstq+d%4]
-pavgb m3, [dstq+d%5]
+%%pavg m0, [dstq]
+%%pavg m1, [dstq+d%3]
+%%pavg m2, [dstq+d%4]
+%%pavg m3, [dstq+d%5]
+%if %2/mmsize == 8
+%%pavg m4, [dstq+mmsize*4]
+%%pavg m5, [dstq+mmsize*5]
+%%pavg m6, [dstq+mmsize*6]
+%%pavg m7, [dstq+mmsize*7]
+%endif
 %endif
 %%dstfn [dstq], m0
 %%dstfn [dstq+d%3], m1
@@ -611,25 +627,38 @@ INIT_MMX mmx
 fpel_fn put, 4, strideq, strideq*2, stride3q, 4
 fpel_fn put, 8, strideq, strideq*2, stride3q, 4
 INIT_MMX mmxext
-fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8
 INIT_XMM sse
 fpel_fn put, 16, strideq, strideq*2, stride3q, 4
 fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
 fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
-fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 8
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8
 INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
-fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8
 INIT_YMM avx
 fpel_fn put, 32, strideq, strideq*2, stride3q, 4
 fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
 fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
 %endif
 %undef s16
 %undef d16
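A closing observation on the new 16bpp instantiations (my reading of the init tables above, not a statement from the commit): the size in the 16bpp function names is the row width in bytes, twice the pixel count, so init_fpel_func(0, 1, 128, avg, _16, sse2) serves 64x64 blocks while the mmxext avg8_16 variant serves 4-pixel-wide blocks. A small standalone check of that mapping:

#include <stdio.h>

/* Hypothetical sanity check: block-size index idx1 = 0..4 corresponds
 * to 64..4 pixels wide, and 16bpp rows are 2 bytes per pixel, giving
 * the 128..8 byte widths used in the fpel names above. */
int main(void)
{
    for (int idx1 = 0; idx1 <= 4; idx1++) {
        int pixels = 64 >> idx1;  /* 64, 32, 16, 8, 4 */
        int bytes  = pixels * 2;  /* 128, 64, 32, 16, 8 */
        printf("idx1=%d: %2d px -> ff_vp9_avg%d_16_*\n", idx1, pixels, bytes);
    }
    return 0;
}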