x86/aacpsdsp: add ps_hybrid_analysis_fma3

This replaces the SSE3 version, which was not really faster than the SSE one.

Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2022-09-20 16:02:49 -03:00
parent 2bcf86d53d
commit 48615f0a78
2 changed files with 25 additions and 23 deletions

View File

@ -403,10 +403,8 @@ HYBRID_SYNTHESIS_DEINT
%macro PS_HYBRID_ANALYSIS_IN 1
movu m0, [inq+mmsize*%1]
movu m1, [inq+mmsize*(5-%1)+8]
mova m3, m0
mova m4, m1
shufps m3, m3, q2301
shufps m4, m4, q0123
shufps m3, m0, m0, q2301
shufps m4, m1, m1, q0123
shufps m1, m1, q1032
%if cpuflag(sse3)
addsubps m3, m4
@ -424,6 +422,15 @@ HYBRID_SYNTHESIS_DEINT
%macro PS_HYBRID_ANALYSIS_LOOP 3
mova m2, [filterq+nq+mmsize*%3]
shufps m2, m2, q2301
%if cpuflag(fma3)
%if %3
fmaddps m3, m2, [rsp+mmsize*%3*2], m3
fmaddps m0, m2, [rsp+mmsize+mmsize*%3*2], m0
%else
mulps m3, m2, [rsp]
mulps m0, m2, [rsp+mmsize]
%endif
%else ; cpuflag(sse)
mova %2, [rsp+mmsize*%3*2]
mova %1, [rsp+mmsize+mmsize*%3*2]
mulps %2, m2
@ -432,20 +439,21 @@ HYBRID_SYNTHESIS_DEINT
addps m3, %2
addps m0, %1
%endif
%endif
%endmacro
%macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n
cglobal ps_hybrid_analysis, 5, 5, 5 + notcpuflag(fma3) * 3, 24 * 4, out, in, filter, stride, n
%if cpuflag(sse3)
%define MOVH movsd
%else
%define MOVH movlps
mova m7, [ps_p1m1p1m1]
%endif
shl strideq, 3
shl nd, 6
add filterq, nq
neg nq
mova m7, [ps_p1m1p1m1]
PS_HYBRID_ANALYSIS_IN 0
PS_HYBRID_ANALYSIS_IN 1
PS_HYBRID_ANALYSIS_IN 2
@ -456,30 +464,22 @@ align 16
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
%if cpuflag(sse3)
pshufd m3, m3, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
shufps m1, m3, m3, q2301
shufps m2, m0, m0, q2301
subps m1, m3
addps m2, m0
unpcklps m3, m1, m2
unpckhps m1, m2
addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
movss m3, [filterq+nq+8*6]
SPLATD m3
%if cpuflag(fma3)
fmaddps m1, m2, m3, m1
%else
mulps m2, m3
addps m1, m2
%endif
MOVH [outq], m1
add outq, strideq
add nq, 64
@ -489,5 +489,5 @@ align 16
INIT_XMM sse
PS_HYBRID_ANALYSIS
INIT_XMM sse3
INIT_XMM fma3
PS_HYBRID_ANALYSIS

View File

@ -33,7 +33,7 @@ void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
void ff_ps_hybrid_analysis_fma3(float (*out)[2], float (*in)[2],
const float (*filter)[8][2],
ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
@ -64,9 +64,11 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
}
if (EXTERNAL_FMA3(cpu_flags)) {
s->hybrid_analysis = ff_ps_hybrid_analysis_fma3;
}
}