mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-12-25 00:32:31 +00:00
x86/float_dsp: add SSE2 and AVX versions of scalarproduct_double
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
7413b81e50
commit
a14440867c
@ -567,6 +567,58 @@ cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
|
||||
%endif
|
||||
RET
|
||||
|
||||
;---------------------------------------------------------------------------------
|
||||
; double scalarproduct_double(const double *v1, const double *v2, size_t len)
|
||||
;---------------------------------------------------------------------------------
|
||||
%macro SCALARPRODUCT_DOUBLE 0
; Dot product of two double vectors; expanded once per instruction set below.
;
; Register roles:
;   v1q, v2q : moved to one-past-the-end of each vector
;   offsetq  : arrives holding len, becomes a negative byte offset that
;              counts up towards zero
;   m0-m3    : four independent partial sums
;   m4-m7    : per-iteration products
;
; The loop body always executes at least once and consumes 4*mmsize bytes of
; each vector per pass, so len is expected to be a non-zero multiple of
; mmsize/2 doubles. v1 is read with aligned loads (movapd).
; NOTE(review): v2 is used as a direct memory operand of mulpd, so callers
; presumably pass it with the same alignment as v1 - confirm.
cglobal scalarproduct_double, 3,3,8, v1, v2, offset
    ; point both bases at the end of their vectors, then index backwards
    lea          v1q, [v1q+offsetq*8]
    lea          v2q, [v2q+offsetq*8]
    shl      offsetq, 3                 ; element count -> byte count
    neg      offsetq

    ; clear the four accumulators
    xorpd         m0, m0
    xorpd         m1, m1
    xorpd         m2, m2
    xorpd         m3, m3

align 16
.loop:
    ; lane 0
    movapd        m4, [v1q+offsetq+mmsize*0]
    mulpd         m4, [v2q+offsetq+mmsize*0]
    addpd         m0, m4
    ; lane 1
    movapd        m5, [v1q+offsetq+mmsize*1]
    mulpd         m5, [v2q+offsetq+mmsize*1]
    addpd         m1, m5
    ; lane 2
    movapd        m6, [v1q+offsetq+mmsize*2]
    mulpd         m6, [v2q+offsetq+mmsize*2]
    addpd         m2, m6
    ; lane 3
    movapd        m7, [v1q+offsetq+mmsize*3]
    mulpd         m7, [v2q+offsetq+mmsize*3]
    addpd         m3, m7

    ; the SIMD ops above leave EFLAGS alone; this add feeds the branch
    add      offsetq, mmsize*4
    jl .loop

    ; collapse the accumulators pairwise: (m0 + m1) + (m2 + m3)
    addpd         m2, m3
    addpd         m0, m1
    addpd         m0, m2

%if mmsize == 32
    ; fold the upper 128 bits onto the lower 128 bits
    vextractf128 xm1, m0, 1
    addpd        xm0, xm1
%endif

    ; add the high double to the low double; the scalar result is in xm0[63:0]
    movhlps      xm1, xm0
    addsd        xm0, xm1

%if ARCH_X86_64 == 0
    ; non-64-bit builds: move the result onto the x87 stack by bouncing it
    ; through the r0m memory slot
    movsd        r0m, xm0
    fld    qword r0m
%endif
    RET
%endmacro

; 128-bit SSE2 flavour is always assembled
INIT_XMM sse2
SCALARPRODUCT_DOUBLE

; 256-bit AVX flavour only when the build enables external AVX
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
SCALARPRODUCT_DOUBLE
%endif
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void ff_butterflies_float(float *src0, float *src1, int len);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
@ -73,6 +73,9 @@ void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
|
||||
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
||||
float ff_scalarproduct_float_fma3(const float *v1, const float *v2, int order);
|
||||
|
||||
/* SSE2 and AVX entry points for scalarproduct_double: each takes two double
 * vectors plus an element count and returns their dot product. The third
 * parameter is named "order" here; the prototype comment on the assembly
 * side calls it "len". */
double ff_scalarproduct_double_sse2(const double *v1, const double *v2, size_t order);
|
||||
double ff_scalarproduct_double_avx(const double *v1, const double *v2, size_t order);
|
||||
|
||||
void ff_butterflies_float_sse(float *restrict src0, float *restrict src1, int len);
|
||||
|
||||
av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
||||
@ -93,6 +96,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
||||
fdsp->vector_dmul = ff_vector_dmul_sse2;
|
||||
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_sse2;
|
||||
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
|
||||
fdsp->scalarproduct_double = ff_scalarproduct_double_sse2;
|
||||
}
|
||||
if (EXTERNAL_AVX_FAST(cpu_flags)) {
|
||||
fdsp->vector_fmul = ff_vector_fmul_avx;
|
||||
@ -102,6 +106,7 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
||||
fdsp->vector_dmac_scalar = ff_vector_dmac_scalar_avx;
|
||||
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
|
||||
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
|
||||
fdsp->scalarproduct_double = ff_scalarproduct_double_avx;
|
||||
}
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
|
||||
|
Loading…
Reference in New Issue
Block a user