From ed9b25a148f228433d65c31ec8d65d5ad1983215 Mon Sep 17 00:00:00 2001 From: James Almer Date: Mon, 10 Apr 2017 12:17:03 -0300 Subject: [PATCH] x86/float_dsp: add ff_vector_dmac_scalar_{sse2,avx,fma3} --- libavutil/x86/float_dsp.asm | 63 ++++++++++++++++++++++++++++++++++ libavutil/x86/float_dsp_init.c | 10 ++++++ 2 files changed, 73 insertions(+) diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 021ff03c87..9affacb72b 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -148,6 +148,69 @@ cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len INIT_XMM sse VECTOR_FMUL_SCALAR +;------------------------------------------------------------------------------ +; void ff_vector_dmac_scalar(double *dst, const double *src, double mul, +; int len) +;------------------------------------------------------------------------------ + +%macro VECTOR_DMAC_SCALAR 0 +%if ARCH_X86_32 +cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr + mov lenq, lenaddrm + VBROADCASTSD m0, mulm +%else +%if UNIX64 +cglobal vector_dmac_scalar, 3,3,5, dst, src, len +%else +cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len + SWAP 0, 2 +%endif + movlhps xm0, xm0 +%if cpuflag(avx) + vinsertf128 m0, m0, xm0, 1 +%endif +%endif + lea lenq, [lend*8-mmsize*4] +.loop: +%if cpuflag(fma3) + movaps m1, [dstq+lenq] + movaps m2, [dstq+lenq+1*mmsize] + movaps m3, [dstq+lenq+2*mmsize] + movaps m4, [dstq+lenq+3*mmsize] + fmaddpd m1, m0, [srcq+lenq], m1 + fmaddpd m2, m0, [srcq+lenq+1*mmsize], m2 + fmaddpd m3, m0, [srcq+lenq+2*mmsize], m3 + fmaddpd m4, m0, [srcq+lenq+3*mmsize], m4 +%else ; cpuflag + mulpd m1, m0, [srcq+lenq] + mulpd m2, m0, [srcq+lenq+1*mmsize] + mulpd m3, m0, [srcq+lenq+2*mmsize] + mulpd m4, m0, [srcq+lenq+3*mmsize] + addpd m1, m1, [dstq+lenq] + addpd m2, m2, [dstq+lenq+1*mmsize] + addpd m3, m3, [dstq+lenq+2*mmsize] + addpd m4, m4, [dstq+lenq+3*mmsize] +%endif ; cpuflag + movaps [dstq+lenq], m1 + movaps [dstq+lenq+1*mmsize], m2 + movaps [dstq+lenq+2*mmsize], m3 + movaps [dstq+lenq+3*mmsize], m4 + sub lenq, mmsize*4 + jge .loop + REP_RET +%endmacro + +INIT_XMM sse2 +VECTOR_DMAC_SCALAR +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +VECTOR_DMAC_SCALAR +%endif +%if HAVE_FMA3_EXTERNAL +INIT_YMM fma3 +VECTOR_DMAC_SCALAR +%endif + ;------------------------------------------------------------------------------ ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul, ; int len) diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index 736b9ad8ac..09c7a4d3b2 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -39,6 +39,13 @@ void ff_vector_fmac_scalar_fma3(float *dst, const float *src, float mul, void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul, int len); +void ff_vector_dmac_scalar_sse2(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_avx(double *dst, const double *src, double mul, + int len); +void ff_vector_dmac_scalar_fma3(double *dst, const double *src, double mul, + int len); + void ff_vector_dmul_scalar_sse2(double *dst, const double *src, double mul, int len); void ff_vector_dmul_scalar_avx(double *dst, const double *src, @@ -83,17 +90,20 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->butterflies_float = ff_butterflies_float_sse; } if (EXTERNAL_SSE2(cpu_flags)) { + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2; } if (EXTERNAL_AVX_FAST(cpu_flags)) { fdsp->vector_fmul = ff_vector_fmul_avx; fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx; fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx; fdsp->vector_fmul_add = ff_vector_fmul_add_avx; fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx; } if (EXTERNAL_FMA3_FAST(cpu_flags)) { fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; + fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3; } }