From 2784d187919b48022a89633fb3b5a99ca97cf869 Mon Sep 17 00:00:00 2001
From: Christophe GISQUET
Date: Thu, 23 Feb 2012 20:12:39 +0100
Subject: [PATCH] SBR DSP x86: implement SSE sbr_hf_g_filt

Unrolling the main loop to process, instead of 4 elements:
- 8: minor gain of 2 cycles (not worth the extra object size)
- 2: loss of 8 cycles.

Assigning STEP to a register is a loss.

Output address (Y) is almost always unaligned.

Timings:
- C (32/64 bits): 117/109 cycles
- SSE: 57 cycles

Signed-off-by: Ronald S. Bultje
---
 libavcodec/x86/sbrdsp.asm    | 53 ++++++++++++++++++++++++++++++++++++
 libavcodec/x86/sbrdsp_init.c |  3 +++
 2 files changed, 56 insertions(+)

diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index 71471bd5ab..c165c52ca4 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -72,3 +72,56 @@ cglobal sbr_sum_square, 2, 3, 6
     fld         dword r0m
 %endif
     RET
+
+;-----------------------------------------------------------------------------
+; void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
+;                           const float *g_filt, int m_max, intptr_t ixh)
+;
+; For each m in [0, m_max):  Y[m][0..1] = X_high[m][ixh][0..1] * g_filt[m]
+;
+; In:    r0 = Y, r1 = X_high, r2 = g_filt, r3 = m_max, r4 = ixh
+; Notes: only the low 8 bits of m_max are used (masks 0xFC and 3 below);
+;        Y is written with unaligned stores (movu);
+;        SSE1 instructions only, since this is installed for AV_CPU_FLAG_SSE.
+;-----------------------------------------------------------------------------
+%define STEP  40*4*2            ; byte distance from X_high[m] to X_high[m+1]
+cglobal sbr_hf_g_filt, 5, 6, 5
+    lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
+    mov         r5, r3          ; keep m_max for the single-element tail
+    and         r3, 0xFC        ; r3 = m_max rounded down to a multiple of 4
+    lea         r2, [r2 + r3*4] ; g_filt and Y now point past the unrolled
+    lea         r0, [r0 + r3*8] ; part; r3 indexes them as a negative count
+    neg         r3
+    jz          .tail           ; m_max < 4: no group of 4, skip .loop4
+.loop4:
+    movlps      m0, [r2 + 4*r3 + 0]
+    movlps      m1, [r2 + 4*r3 + 8]
+    movlps      m2, [r1 + 0*STEP]
+    movlps      m3, [r1 + 2*STEP]
+    movhps      m2, [r1 + 1*STEP]
+    movhps      m3, [r1 + 3*STEP]
+    unpcklps    m0, m0          ; g0 g0 g1 g1 (upper input lanes are unused)
+    unpcklps    m1, m1          ; g2 g2 g3 g3
+    mulps       m0, m2
+    mulps       m1, m3
+    movu        [r0 + 8*r3 +  0], m0
+    movu        [r0 + 8*r3 + 16], m1
+    add         r1, 4*STEP
+    add         r3, 4
+    jnz         .loop4
+.tail:
+    and         r5, 3 ; number of single element loops
+    jz          .end
+.loop1: ; element 0 and 1 can be computed at the same time
+    movss       m0, [r2]
+    movlps      m2, [r1]        ; only the low two lanes are stored below
+    unpcklps    m0, m0
+    mulps       m2, m0
+    movlps      [r0], m2
+    add         r0, 8
+    add         r2, 4
+    add         r1, STEP
+    dec         r5
+    jnz         .loop1
+.end:
+    RET
diff --git a/libavcodec/x86/sbrdsp_init.c b/libavcodec/x86/sbrdsp_init.c
index 313f492054..0ffe5b9e11 100644
--- a/libavcodec/x86/sbrdsp_init.c
+++ b/libavcodec/x86/sbrdsp_init.c
@@ -24,6 +24,8 @@
 #include "libavcodec/sbrdsp.h"
 
 float ff_sbr_sum_square_sse(float (*x)[2], int n);
+void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
+                          const float *g_filt, int m_max, intptr_t ixh);
 
 void ff_sbrdsp_init_x86(SBRDSPContext *s)
 {
@@ -32,6 +34,7 @@ void ff_sbrdsp_init_x86(SBRDSPContext *s)
 
         if (mm_flags & AV_CPU_FLAG_SSE) {
             s->sum_square = ff_sbr_sum_square_sse;
+            s->hf_g_filt = ff_sbr_hf_g_filt_sse;
         }
     }
 }