From 2bcf86d53df64276fbe1b95d8eaf335be0a766ab Mon Sep 17 00:00:00 2001 From: James Almer Date: Tue, 20 Sep 2022 15:09:40 -0300 Subject: [PATCH] x86/aacpsdsp: precompute constant factors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inspired by the optimization done to the C version by RĂ©mi Denis-Courmont. Signed-off-by: James Almer --- libavcodec/x86/aacpsdsp.asm | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm index 4acd087c85..543d33e68d 100644 --- a/libavcodec/x86/aacpsdsp.asm +++ b/libavcodec/x86/aacpsdsp.asm @@ -400,29 +400,32 @@ HYBRID_SYNTHESIS_DEINT ; const float (*filter)[8][2], ; ptrdiff_t stride, int n); ;******************************************************************* -%macro PS_HYBRID_ANALYSIS_LOOP 3 - movu %1, [inq+mmsize*%3] - movu m1, [inq+mmsize*(5-%3)+8] -%if cpuflag(sse3) - pshufd %2, %1, q2301 - pshufd m4, m1, q0123 - pshufd m1, m1, q1032 - pshufd m2, [filterq+nq+mmsize*%3], q2301 - addsubps %2, m4 - addsubps %1, m1 -%else - mova m2, [filterq+nq+mmsize*%3] - mova %2, %1 +%macro PS_HYBRID_ANALYSIS_IN 1 + movu m0, [inq+mmsize*%1] + movu m1, [inq+mmsize*(5-%1)+8] + mova m3, m0 mova m4, m1 - shufps %2, %2, q2301 + shufps m3, m3, q2301 shufps m4, m4, q0123 shufps m1, m1, q1032 - shufps m2, m2, q2301 +%if cpuflag(sse3) + addsubps m3, m4 + addsubps m0, m1 +%else xorps m4, m7 xorps m1, m7 - subps %2, m4 - subps %1, m1 + subps m3, m4 + subps m0, m1 %endif + mova [rsp+mmsize*%1*2], m3 + mova [rsp+mmsize+mmsize*%1*2], m0 +%endmacro + +%macro PS_HYBRID_ANALYSIS_LOOP 3 + mova m2, [filterq+nq+mmsize*%3] + shufps m2, m2, q2301 + mova %2, [rsp+mmsize*%3*2] + mova %1, [rsp+mmsize+mmsize*%3*2] mulps %2, m2 mulps %1, m2 %if %3 @@ -432,7 +435,7 @@ HYBRID_SYNTHESIS_DEINT %endmacro %macro PS_HYBRID_ANALYSIS 0 -cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n +cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n %if cpuflag(sse3) %define MOVH movsd %else @@ -443,6 +446,9 @@ cglobal ps_hybrid_analysis, 5, 5, 8, out, in, filter, stride, n add filterq, nq neg nq mova m7, [ps_p1m1p1m1] + PS_HYBRID_ANALYSIS_IN 0 + PS_HYBRID_ANALYSIS_IN 1 + PS_HYBRID_ANALYSIS_IN 2 align 16 .loop: