From 72acff9f593f977944a62652fc9dd346ec53225a Mon Sep 17 00:00:00 2001 From: Paul B Mahol Date: Wed, 20 Jan 2021 16:58:31 +0100 Subject: [PATCH] avutil/x86/float_dsp: add fma3 for scalarproduct --- libavutil/x86/float_dsp.asm | 127 +++++++++++++++++++++++++++++++++ libavutil/x86/float_dsp_init.c | 2 + 2 files changed, 129 insertions(+) diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index cca4d019c7..8f8e6dddf5 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset %endif RET +INIT_YMM fma3 +cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset + xor offsetq, offsetq + xorps m0, m0 + shl sized, 2 + mov lenq, sizeq + cmp lenq, 32 + jl .l16 + cmp lenq, 64 + jl .l32 + xorps m1, m1 + cmp lenq, 128 + jl .l64 + and lenq, ~127 + xorps m2, m2 + xorps m3, m3 +.loop128: + movups m4, [v1q+offsetq] + movups m5, [v1q+offsetq + 32] + movups m6, [v1q+offsetq + 64] + movups m7, [v1q+offsetq + 96] + fmaddps m0, m4, [v2q+offsetq ], m0 + fmaddps m1, m5, [v2q+offsetq + 32], m1 + fmaddps m2, m6, [v2q+offsetq + 64], m2 + fmaddps m3, m7, [v2q+offsetq + 96], m3 + add offsetq, 128 + cmp offsetq, lenq + jl .loop128 + addps m0, m2 + addps m1, m3 + mov lenq, sizeq + and lenq, 127 + cmp lenq, 64 + jge .l64 + addps m0, m1 + cmp lenq, 32 + jge .l32 + vextractf128 xmm2, m0, 1 + addps xmm0, xmm2 + cmp lenq, 16 + jge .l16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET +.l64: + and lenq, ~63 + add lenq, offsetq +.loop64: + movups m4, [v1q+offsetq] + movups m5, [v1q+offsetq + 32] + fmaddps m0, m4, [v2q+offsetq], m0 + fmaddps m1, m5, [v2q+offsetq + 32], m1 + add offsetq, 64 + cmp offsetq, lenq + jl .loop64 + addps m0, m1 + mov lenq, sizeq + and lenq, 63 + cmp lenq, 32 + jge .l32 + vextractf128 xmm2, m0, 1 + addps xmm0, xmm2 + cmp lenq, 16 + jge .l16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET +.l32: + and lenq, ~31 + add lenq, offsetq +.loop32: + movups m4, [v1q+offsetq] + fmaddps m0, m4, [v2q+offsetq], m0 + add offsetq, 32 + cmp offsetq, lenq + jl .loop32 + vextractf128 xmm2, m0, 1 + addps xmm0, xmm2 + mov lenq, sizeq + and lenq, 31 + cmp lenq, 16 + jge .l16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET +.l16: + and lenq, ~15 + add lenq, offsetq +.loop16: + movaps xmm1, [v1q+offsetq] + mulps xmm1, [v2q+offsetq] + addps xmm0, xmm1 + add offsetq, 16 + cmp offsetq, lenq + jl .loop16 + movhlps xmm1, xmm0 + addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xm0 + fld dword r0m +%endif + RET + ;----------------------------------------------------------------------------- ; void ff_butterflies_float(float *src0, float *src1, int len); ;----------------------------------------------------------------------------- diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c index ad17bc2044..ad6b506259 100644 --- a/libavutil/x86/float_dsp_init.c +++ b/libavutil/x86/float_dsp_init.c @@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0, const float *src1, int len); float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); +float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order); void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len); @@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp) fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3; fdsp->vector_fmul_add = ff_vector_fmul_add_fma3; fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3; + fdsp->scalarproduct_float = ff_scalarproduct_float_fma3; } }