avutil/x86/float_dsp: add fma3 for scalarproduct

This commit is contained in:
Paul B Mahol 2021-01-20 16:58:31 +01:00
parent cf2cf31805
commit 72acff9f59
2 changed files with 129 additions and 0 deletions

View File

@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
%endif
RET
;-----------------------------------------------------------------------------
; float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
;
; Sum of v1[i] * v2[i] over the first `order` floats.
;
; The byte length (order * 4) is consumed in progressively narrower steps:
; 128-byte blocks (4 ymm accumulators), then at most one 64-byte block,
; one 32-byte block and one 16-byte block. Any remainder smaller than
; 16 bytes is ignored, so `order` is effectively rounded down to a
; multiple of 4; an `order` below 4 returns 0.0 without reading memory.
;
; Register roles:
;   offsetq = byte offset of the next unread element in both vectors
;   lenq    = byte offset at which the current loop stops, or the number
;             of bytes still to process between loops
;   m0-m3   = partial-sum accumulators, m4-m7 = data loaded from v1
;
; Result: xmm0 lane 0; on x86-32 it is additionally pushed onto the x87
; stack through the r0m stack slot.
;-----------------------------------------------------------------------------
INIT_YMM fma3
cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
    xor      offsetq, offsetq
    xorps         m0, m0
    shl        sized, 2                     ; element count -> byte count
    mov         lenq, sizeq
    cmp         lenq, 16
    jl .hsum                                ; not even one 16-byte group: sum stays 0
    cmp         lenq, 32
    jl .l16
    cmp         lenq, 64
    jl .l32
    xorps         m1, m1
    cmp         lenq, 128
    jl .l64
    and         lenq, ~127                  ; stop offset of the 128-byte loop
    xorps         m2, m2
    xorps         m3, m3

.loop128:
    movups        m4, [v1q+offsetq     ]
    movups        m5, [v1q+offsetq + 32]
    movups        m6, [v1q+offsetq + 64]
    movups        m7, [v1q+offsetq + 96]
    fmaddps       m0, m4, [v2q+offsetq     ], m0
    fmaddps       m1, m5, [v2q+offsetq + 32], m1
    fmaddps       m2, m6, [v2q+offsetq + 64], m2
    fmaddps       m3, m7, [v2q+offsetq + 96], m3
    add      offsetq, 128
    cmp      offsetq, lenq
    jl .loop128

    ; Collapse four accumulators into two; m0/m1 stay live for .l64.
    addps         m0, m2
    addps         m1, m3
    mov         lenq, sizeq
    and         lenq, 127                   ; bytes left after the 128-byte blocks
    cmp         lenq, 64
    jge .l64
    addps         m0, m1                    ; only m0 is live from here on
    cmp         lenq, 32
    jge .l32
    jmp .fold256

.l64:
    and         lenq, ~63
    add         lenq, offsetq               ; stop offset of the 64-byte loop
.loop64:
    movups        m4, [v1q+offsetq     ]
    movups        m5, [v1q+offsetq + 32]
    fmaddps       m0, m4, [v2q+offsetq     ], m0
    fmaddps       m1, m5, [v2q+offsetq + 32], m1
    add      offsetq, 64
    cmp      offsetq, lenq
    jl .loop64

    addps         m0, m1
    mov         lenq, sizeq
    and         lenq, 63                    ; bytes left after the 64-byte blocks
    cmp         lenq, 32
    jl .fold256

.l32:
    and         lenq, ~31
    add         lenq, offsetq               ; stop offset of the 32-byte loop
.loop32:
    movups        m4, [v1q+offsetq]
    fmaddps       m0, m4, [v2q+offsetq], m0
    add      offsetq, 32
    cmp      offsetq, lenq
    jl .loop32

    mov         lenq, sizeq
    and         lenq, 31                    ; bytes left after the 32-byte blocks

    ; Shared tail: every path that accumulated in ymm arrives here with the
    ; full partial sum in m0 and the remaining byte count (< 32) in lenq.
.fold256:
    vextractf128 xmm2, m0, 1
    addps       xmm0, xmm2                  ; 8 lanes -> 4 lanes
    cmp         lenq, 16
    jl .hsum

.l16:
    and         lenq, ~15
    add         lenq, offsetq               ; stop offset of the 16-byte loop
.loop16:
    ; Unaligned load, matching the loads used by the wider loops above.
    movups      xmm1, [v1q+offsetq]
    mulps       xmm1, [v2q+offsetq]
    addps       xmm0, xmm1
    add      offsetq, 16
    cmp      offsetq, lenq
    jl .loop16

    ; Horizontal add of the four lanes of xmm0 into lane 0.
.hsum:
    movhlps     xmm1, xmm0
    addps       xmm0, xmm1                  ; lanes 0,1 += lanes 2,3
    movss       xmm1, xmm0
    shufps      xmm0, xmm0, 1               ; bring lane 1 down to lane 0
    addss       xmm0, xmm1
%if ARCH_X86_64 == 0
    movss        r0m, xm0                   ; bounce through the stack to load into x87
    fld    dword r0m
%endif
    RET
;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------

View File

@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
const float *src1, int len);
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
}
}