From b5a0971ff041badbdd1482e4ae2a0a16700a748f Mon Sep 17 00:00:00 2001
From: James Almer
Date: Tue, 23 May 2017 15:19:39 -0300
Subject: [PATCH] x86/aacps: add ff_ps_stereo_interpolate_ipdopd_sse3()

About 2x faster than the c version.

Signed-off-by: James Almer
---
 libavcodec/x86/aacpsdsp.asm    | 57 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/aacpsdsp_init.c |  4 +++
 2 files changed, 61 insertions(+)

diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm
index e92cbbce08..bb8a7f5df0 100644
--- a/libavcodec/x86/aacpsdsp.asm
+++ b/libavcodec/x86/aacpsdsp.asm
@@ -117,6 +117,63 @@ align 16
 .ret:
     REP_RET
 
+;***************************************************************************
+;void ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+;                                       float h[2][4], float h_step[2][4],
+;                                       int len);
+;
+; Per sample: a register copy of h is advanced by h_step, then, in place,
+;   l' = (h[0][0] + i*h[1][0]) * l + (h[0][2] + i*h[1][2]) * r
+;   r' = (h[0][1] + i*h[1][1]) * l + (h[0][3] + i*h[1][3]) * r
+; h and h_step are only read (movaps: must be 16-byte aligned).
+; Does nothing when len <= 0.
+;***************************************************************************
+INIT_XMM sse3
+cglobal ps_stereo_interpolate_ipdopd, 5, 5, 10, l, r, h, h_step, n
+    cmp      nd, 0
+    jle .ret                          ; nothing to do for len <= 0
+    movaps   m0, [hq]                 ; m0 = h[0][0..3]
+    movaps   m1, [hq+mmsize]          ; m1 = h[1][0..3]
+%if ARCH_X86_64
+    movaps   m8, [h_stepq]            ; x86-64 only: cache both h_step rows in m8/m9,
+    movaps   m9, [h_stepq+mmsize]     ; otherwise H_STEP0/1 are memory operands
+    %define  H_STEP0 m8
+    %define  H_STEP1 m9
+%else
+    %define  H_STEP0 [h_stepq]
+    %define  H_STEP1 [h_stepq+mmsize]
+%endif
+    shl      nd, 3                    ; len -> byte count (8 bytes per float[2])
+    add      lq, nq                   ; point l and r just past the last sample...
+    add      rq, nq
+    neg      nq                       ; ...and index with a negative offset counting up to 0
+
+align 16
+.loop:
+    addps    m0, H_STEP0              ; h[0][*] += h_step[0][*]
+    addps    m1, H_STEP1              ; h[1][*] += h_step[1][*]
+    movddup  m2, [lq+nq]              ; m2 = l_re, l_im, l_re, l_im
+    movddup  m3, [rq+nq]              ; m3 = r_re, r_im, r_re, r_im
+    shufps   m4, m2, m2, q2301        ; m4 = l_im, l_re, l_im, l_re
+    shufps   m5, m3, m3, q2301        ; m5 = r_im, r_re, r_im, r_re
+    unpcklps m6, m0, m0               ; m6 = h00, h00, h01, h01   (hRC = h[R][C])
+    unpckhps m7, m0, m0               ; m7 = h02, h02, h03, h03
+    mulps    m2, m6                   ; h00*l | h01*l
+    mulps    m3, m7                   ; h02*r | h03*r
+    unpcklps m6, m1, m1               ; m6 = h10, h10, h11, h11
+    unpckhps m7, m1, m1               ; m7 = h12, h12, h13, h13
+    mulps    m4, m6                   ; h10*swapped(l) | h11*swapped(l)
+    mulps    m5, m7                   ; h12*swapped(r) | h13*swapped(r)
+    addps    m2, m3                   ; sum of the h[0] terms
+    addps    m4, m5                   ; sum of the h[1] terms: plain add, both take the same sign below
+    addsubps m2, m4                   ; re lanes: m2 - m4, im lanes: m2 + m4
+    movsd  [lq+nq], m2                ; low pair  -> l[n]
+    movhps [rq+nq], m2                ; high pair -> r[n]
+    add      nq, 8
+    jl .loop                          ; loop until the negative offset reaches 0
+.ret:
+    REP_RET
+
 ;*******************************************************************
 ;void ff_ps_hybrid_analysis_(float (*out)[2], float (*in)[2],
 ;                                   const float (*filter)[8][2],
diff --git a/libavcodec/x86/aacpsdsp_init.c b/libavcodec/x86/aacpsdsp_init.c
index f6d6c039c3..767ae6588e 100644
--- a/libavcodec/x86/aacpsdsp_init.c
+++ b/libavcodec/x86/aacpsdsp_init.c
@@ -37,6 +37,9 @@ void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
 void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
                                    float h[2][4], float h_step[2][4],
                                    int len);
+void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
+                                          float h[2][4], float h_step[2][4],
+                                          int len);
 
 av_cold void ff_psdsp_init_x86(PSDSPContext *s)
 {
@@ -50,6 +53,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
     if (EXTERNAL_SSE3(cpu_flags)) {
         s->add_squares            = ff_ps_add_squares_sse3;
         s->stereo_interpolate[0]  = ff_ps_stereo_interpolate_sse3;
+        s->stereo_interpolate[1]  = ff_ps_stereo_interpolate_ipdopd_sse3;
         s->hybrid_analysis        = ff_ps_hybrid_analysis_sse3;
     }
 }