From f1d80bc6305221506ad96f0ab82088f9229881ab Mon Sep 17 00:00:00 2001 From: James Almer Date: Tue, 11 Apr 2017 21:29:09 -0300 Subject: [PATCH] x86/float_dsp: add ff_vector_fmul_reverse_avx2 ~20% faster than AVX. Signed-off-by: James Almer --- libavutil/x86/float_dsp.asm | 15 ++++++++++++++- libavutil/x86/float_dsp_init.c | 5 +++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 9affacb72b..edade0d55d 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -22,6 +22,9 @@ %include "x86util.asm" +SECTION_RODATA 32 +pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0 + SECTION .text ;----------------------------------------------------------------------------- @@ -359,10 +362,16 @@ VECTOR_FMUL_ADD ;----------------------------------------------------------------------------- %macro VECTOR_FMUL_REVERSE 0 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len +%if cpuflag(avx2) + mova m2, [pd_reverse] +%endif lea lenq, [lend*4 - 2*mmsize] ALIGN 16 .loop: -%if cpuflag(avx) +%if cpuflag(avx2) + vpermd m0, m2, [src1q] + vpermd m1, m2, [src1q+mmsize] +%elif cpuflag(avx) vmovaps xmm0, [src1q + 16] vinsertf128 m0, m0, [src1q], 1 vshufps m0, m0, m0, q0123 @@ -391,6 +400,10 @@ VECTOR_FMUL_REVERSE INIT_YMM avx VECTOR_FMUL_REVERSE %endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +VECTOR_FMUL_REVERSE +%endif ; float scalarproduct_float_sse(const float *v1, const float *v2, int len) INIT_XMM sse diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index 09c7a4d3b2..122087a196 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -67,6 +67,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len); void ff_vector_fmul_reverse_avx(float *dst, const float *src0, const float *src1, int len); +void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, + const float *src1, int len); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); @@ -101,6 +103,9 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2; + } if (EXTERNAL_FMA3_FAST(cpu_flags)) { fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;