mirror of https://git.ffmpeg.org/ffmpeg.git
avutil/x86/float_dsp: add fma3 for scalarproduct
This commit is contained in:
parent
cf2cf31805
commit
72acff9f59
|
@ -440,6 +440,133 @@ cglobal scalarproduct_float, 3,3,2, v1, v2, offset
|
||||||
%endif
|
%endif
|
||||||
RET
|
RET
|
||||||
|
|
||||||
|
;-----------------------------------------------------------------------------
; float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order)
;
; Returns the sum of v1[i] * v2[i] over `size` floats.
;
; The byte length is consumed by a cascade of progressively narrower stages
; (128, 64, 32, then 16 bytes per iteration).  Each stage is entered either
; directly from the dispatch at the top (when it is the widest one that
; fits) or by falling through from the stage above it; a stage with nothing
; left to do is skipped through the matching .sumNN label, which is where
; the accumulators of the wider stage are folded together.
;
; Register roles:
;   offsetq  byte offset of the next unread element in v1/v2
;   sizeq    total length in bytes (size * 4), kept for the remainder masks
;   lenq     on entry to a stage: bytes still to process;
;            inside a stage loop: offset at which that loop stops
;   m0-m3    running sums (m1-m3 only live in the stages that use them)
;   m4-m7    loads from v1
;
; NOTE(review): the 16-byte stage loads with movaps and always runs at least
; once when entered, so this presumably relies on both buffers being 16-byte
; aligned and on size being a non-zero multiple of 4 -- confirm against the
; callers' contract.
;-----------------------------------------------------------------------------
INIT_YMM fma3
cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
    xor     offsetq, offsetq
    xorps        m0, m0
    shl       sized, 2                  ; element count -> byte count
    mov        lenq, sizeq

    ; Dispatch to the widest stage that fits, zeroing only the
    ; accumulators that stage is going to use.
    cmp        lenq, 32
    jl .l16
    cmp        lenq, 64
    jl .l32
    xorps        m1, m1
    cmp        lenq, 128
    jl .l64
    and        lenq, ~127               ; loop128 stops at the last full 128-byte block
    xorps        m2, m2
    xorps        m3, m3

.loop128:
    movups       m4, [v1q+offsetq      ]
    movups       m5, [v1q+offsetq +  32]
    movups       m6, [v1q+offsetq +  64]
    movups       m7, [v1q+offsetq +  96]
    fmaddps      m0, m4, [v2q+offsetq      ], m0    ; m0 += v1 * v2
    fmaddps      m1, m5, [v2q+offsetq +  32], m1
    fmaddps      m2, m6, [v2q+offsetq +  64], m2
    fmaddps      m3, m7, [v2q+offsetq +  96], m3
    add     offsetq, 128
    cmp     offsetq, lenq
    jl .loop128

    addps        m0, m2                 ; fold four accumulators into two
    addps        m1, m3
    mov        lenq, sizeq
    and        lenq, 127                ; bytes left after the 128-byte stage
    cmp        lenq, 64
    jl .sum64                           ; lenq < 64 here, i.e. already size & 63

.l64:
    and        lenq, ~63
    add        lenq, offsetq            ; absolute end offset for loop64
.loop64:
    movups       m4, [v1q+offsetq      ]
    movups       m5, [v1q+offsetq +  32]
    fmaddps      m0, m4, [v2q+offsetq      ], m0
    fmaddps      m1, m5, [v2q+offsetq +  32], m1
    add     offsetq, 64
    cmp     offsetq, lenq
    jl .loop64

    mov        lenq, sizeq
    and        lenq, 63                 ; bytes left after the 64-byte stage
.sum64:
    addps        m0, m1                 ; fold two accumulators into one
    cmp        lenq, 32
    jl .sum32                           ; lenq < 32 here, i.e. already size & 31

.l32:
    and        lenq, ~31
    add        lenq, offsetq            ; absolute end offset for loop32
.loop32:
    movups       m4, [v1q+offsetq]
    fmaddps      m0, m4, [v2q+offsetq], m0
    add     offsetq, 32
    cmp     offsetq, lenq
    jl .loop32

    mov        lenq, sizeq
    and        lenq, 31                 ; bytes left after the 32-byte stage
.sum32:
    vextractf128 xm2, m0, 1             ; add the upper 128-bit lane onto the lower
    addps       xm0, xm2
    cmp        lenq, 16
    jl .sum16

.l16:
    and        lenq, ~15
    add        lenq, offsetq            ; absolute end offset for loop16
.loop16:
    movaps      xm1, [v1q+offsetq]
    mulps       xm1, [v2q+offsetq]
    addps       xm0, xm1
    add     offsetq, 16
    cmp     offsetq, lenq
    jl .loop16

.sum16:
    ; Horizontal add of the four floats in xm0 into element 0.
    movhlps     xm1, xm0                ; xm1[0..1] = xm0[2..3]
    addps       xm0, xm1                ; xm0[0] = a0+a2, xm0[1] = a1+a3
    movss       xm1, xm0                ; keep a0+a2 in xm1[0]
    shufps      xm0, xm0, 1             ; move a1+a3 into element 0
    addss       xm0, xm1
%if ARCH_X86_64 == 0
    movss     r0m, xm0                  ; bounce through the first argument's
    fld dword r0m                       ; stack slot to load the result onto the x87 stack
%endif
    RET
|
||||||
|
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
; void ff_butterflies_float(float *src0, float *src1, int len);
|
; void ff_butterflies_float(float *src0, float *src1, int len);
|
||||||
;-----------------------------------------------------------------------------
|
;-----------------------------------------------------------------------------
|
||||||
|
|
|
@ -74,6 +74,7 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
|
||||||
const float *src1, int len);
|
const float *src1, int len);
|
||||||
|
|
||||||
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
||||||
|
float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
|
||||||
|
|
||||||
void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
|
void ff_butterflies_float_sse(float *av_restrict src0, float *av_restrict src1, int len);
|
||||||
|
|
||||||
|
@ -112,5 +113,6 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
||||||
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
|
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
|
||||||
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
|
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
|
||||||
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
|
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_fma3;
|
||||||
|
fdsp->scalarproduct_float = ff_scalarproduct_float_fma3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue