mirror of https://git.ffmpeg.org/ffmpeg.git
x86/float_dsp: add ff_vector_fmul_reverse_avx2
~20% faster than the AVX version.
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
5b441d2981
commit
f1d80bc630
|
@ -22,6 +22,9 @@
|
|||
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
|
@ -359,10 +362,16 @@ VECTOR_FMUL_ADD
|
|||
;-----------------------------------------------------------------------------
|
||||
%macro VECTOR_FMUL_REVERSE 0
|
||||
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
|
||||
%if cpuflag(avx2)
|
||||
mova m2, [pd_reverse]
|
||||
%endif
|
||||
lea lenq, [lend*4 - 2*mmsize]
|
||||
ALIGN 16
|
||||
.loop:
|
||||
%if cpuflag(avx)
|
||||
%if cpuflag(avx2)
|
||||
vpermd m0, m2, [src1q]
|
||||
vpermd m1, m2, [src1q+mmsize]
|
||||
%elif cpuflag(avx)
|
||||
vmovaps xmm0, [src1q + 16]
|
||||
vinsertf128 m0, m0, [src1q], 1
|
||||
vshufps m0, m0, m0, q0123
|
||||
|
@ -391,6 +400,10 @@ VECTOR_FMUL_REVERSE
|
|||
INIT_YMM avx
|
||||
VECTOR_FMUL_REVERSE
|
||||
%endif
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2
|
||||
VECTOR_FMUL_REVERSE
|
||||
%endif
|
||||
|
||||
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
|
||||
INIT_XMM sse
|
||||
|
|
|
@ -67,6 +67,8 @@ void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
|
|||
const float *src1, int len);
|
||||
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
|
||||
const float *src1, int len);
|
||||
void ff_vector_fmul_reverse_avx2(float *dst, const float *src0,
|
||||
const float *src1, int len);
|
||||
|
||||
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
|
||||
|
||||
|
@ -101,6 +103,9 @@ av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
|
|||
fdsp->vector_fmul_add = ff_vector_fmul_add_avx;
|
||||
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
|
||||
}
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||
fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_avx2;
|
||||
}
|
||||
if (EXTERNAL_FMA3_FAST(cpu_flags)) {
|
||||
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_fma3;
|
||||
fdsp->vector_fmul_add = ff_vector_fmul_add_fma3;
|
||||
|
|
Loading…
Reference in New Issue