avutil/x86/float_dsp: add fma3 for scalarproduct

2021-01-20 16:58:31 +01:00 · 2021-01-20 16:58:31 +01:00 · 72acff9f59
parent cf2cf31805
commit 72acff9f59
2 changed files with 129 additions and 0 deletions
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
 %endif
    RET
 ;-----------------------------------------------------------------------------
 ; float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order)
 ;
 ; Returns the sum of v1[i] * v2[i] over the first `order` floats.
 ;
 ; The element count is turned into a byte count and consumed in the widest
 ; chunks that still fit: 128, 64, 32 and finally 16 bytes per iteration.
 ; Every loop below is a do-while, so each entry point is only reached when at
 ; least one full chunk of that width is left (apart from the case noted below).
 ;
 ; Register roles:
 ;   v1q, v2q  input pointers
 ;   sizeq     total byte count (order * 4)
 ;   offsetq   byte offset of the next chunk to read
 ;   lenq      byte offset at which the current loop stops; between loops it
 ;             temporarily holds the number of bytes the previous loop left over
 ;   m0..m3    partial-sum accumulators (m2/m3 only in the 128-byte loop)
 ;   m4..m7    data loaded from v1
 ;
 ; Input expectations:
 ;   - order is treated as a multiple of 4: the narrowest step is 16 bytes, so
 ;     trailing elements past the last full group of four are not accumulated,
 ;     and a nonzero order below 4 still reads one full 16-byte group.
 ;   - order == 0 returns 0.0 without touching memory.
 ;   - the 16-byte loop loads v1 with movaps, so v1 must be 16-byte aligned
 ;     whenever that loop runs; the 32-byte loads use movups.
 ;
 ; NOTE(review): the summation order differs between the code paths, so results
 ; may differ in rounding from a sequential scalar sum.
 ;-----------------------------------------------------------------------------
 INIT_YMM fma3
 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
    xor   offsetq, offsetq
    xorps      m0, m0
    shl     sized, 2                ; element count -> byte count
    mov      lenq, sizeq
    test     lenq, lenq
    jz   .hsum                      ; empty input: m0 is zero, reduce it to 0.0
    cmp      lenq, 32
    jl   .l16
    cmp      lenq, 64
    jl   .l32
    xorps    m1, m1
    cmp      lenq, 128
    jl   .l64
    and    lenq, ~127               ; stop offset = size rounded down to 128
    xorps    m2, m2
    xorps    m3, m3
 .loop128:
    movups   m4, [v1q+offsetq]
    movups   m5, [v1q+offsetq + 32]
    movups   m6, [v1q+offsetq + 64]
    movups   m7, [v1q+offsetq + 96]
    fmaddps  m0, m4, [v2q+offsetq     ], m0
    fmaddps  m1, m5, [v2q+offsetq + 32], m1
    fmaddps  m2, m6, [v2q+offsetq + 64], m2
    fmaddps  m3, m7, [v2q+offsetq + 96], m3
    add   offsetq, 128
    cmp   offsetq, lenq
    jl .loop128
    addps    m0, m2                 ; four accumulators -> two
    addps    m1, m3
    mov      lenq, sizeq
    and      lenq, 127              ; bytes left over by the 128-byte loop
    cmp      lenq, 64
    jge .l64
    addps    m0, m1                 ; two accumulators -> one
    cmp      lenq, 32
    jge .l32
    jmp .fold256
 .l64:
    and    lenq, ~63                ; whole 64-byte chunks remaining ...
    add    lenq, offsetq            ; ... converted to an absolute stop offset
 .loop64:
    movups   m4, [v1q+offsetq]
    movups   m5, [v1q+offsetq + 32]
    fmaddps  m0, m4, [v2q+offsetq], m0
    fmaddps  m1, m5, [v2q+offsetq + 32], m1
    add   offsetq, 64
    cmp   offsetq, lenq
    jl .loop64
    addps    m0, m1                 ; two accumulators -> one
    mov      lenq, sizeq
    and      lenq, 63               ; bytes left over by the 64-byte loop
    cmp      lenq, 32
    jge .l32
    jmp .fold256
 .l32:
    and    lenq, ~31
    add    lenq, offsetq
 .loop32:
    movups   m4, [v1q+offsetq]
    fmaddps  m0, m4, [v2q+offsetq], m0
    add   offsetq, 32
    cmp   offsetq, lenq
    jl .loop32
    mov      lenq, sizeq
    and      lenq, 31               ; bytes left over by the 32-byte loop
 .fold256:
    ; All 256-bit work is done and lenq < 32 holds the leftover byte count.
    ; Fold the upper 128 bits of the accumulator into the lower 128 bits.
    vextractf128 xmm2, m0, 1
    addps    xmm0, xmm2
    cmp      lenq, 16
    jl .hsum                        ; no full 16-byte group left
 .l16:
    ; Also entered straight from the prologue when size < 32; xmm0 is zero then.
    and    lenq, ~15
    add    lenq, offsetq
 .loop16:
    movaps   xmm1, [v1q+offsetq]    ; aligned load, see header
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    cmp   offsetq, lenq
    jl .loop16
 .hsum:
    ; Horizontal sum of the four floats in xmm0 into its lowest lane.
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
 %if ARCH_X86_64 == 0
    ; 32-bit builds hand the result back on the x87 stack, bounced through r0m.
    movss r0m, xm0
    fld dword r0m
 %endif
    RET
 ;-----------------------------------------------------------------------------
 ; void ff_butterflies_float(float *src0, float *src1, int len);
 ;-----------------------------------------------------------------------------
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
                                 const float *src1, int len);
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
 void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
        fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
        fdsp->vector_fmul_add    = ff_vector_fmul_add_fma3;
        fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
        fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
    }
 }