x86: sbrdsp: Implement SSE2 qmf_deint_bfly

Sandybridge: 47 cycles

Having a loop counter is a 7 cycle gain.
Unrolling is another 7 cycle gain.
Scanning in reverse order gains another 6 cycles.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
This commit is contained in:
Christophe Gisquet 2013-04-09 21:57:07 +00:00 committed by Diego Biurrun
parent 769d921f3e
commit 5a97469a4f
2 changed files with 33 additions and 0 deletions

View File

@ -245,3 +245,31 @@ cglobal sbr_neg_odd_64, 1,2,4,z
cmp zq, r1q
jne .loop
REP_RET
INIT_XMM sse2
;-----------------------------------------------------------------------------
; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
;
; Butterfly of src0 against the element-reversed src1:
;     v[i]       = src0[i] - src1[63 - i]
;     v[127 - i] = src0[i] + src1[63 - i]          for i = 0..63
;
; Reads 64 floats from each source and writes 128 floats to v.  Every memory
; access goes through mova, so all three pointers must be mmsize-aligned.
;
; Scratch registers:
;   vhiq - write cursor for the upper half of v, ascending from v + 64 floats
;   offq - byte offset into src0 and the lower half of v, descending to 0
;-----------------------------------------------------------------------------
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vhi,off
    lea             vhiq, [vq + 64*4]               ; &v[64]
    mov             offq, 64*4 - 2*mmsize           ; last vector pair of src0
.loop:
    ; two vectors from the current tail of src0, two from the head of src1
    mova              m0, [src0q + offq]
    mova              m2, [src0q + offq + mmsize]
    mova              m1, [src1q]
    mova              m3, [src1q + mmsize]

    ; element-reversed copies of all four inputs (q0123 flips the dwords);
    ; these must be taken before the arithmetic below overwrites m0..m3
    pshufd            m4, m0, q0123
    pshufd            m6, m2, q0123
    pshufd            m5, m1, q0123
    pshufd            m7, m3, q0123

    ; lower half of v: src0 minus reversed src1
    subps             m0, m7
    subps             m2, m5
    mova     [vq + offq], m0
    mova     [vq + offq + mmsize], m2

    ; upper half of v: src1 plus reversed src0
    addps             m1, m6
    addps             m3, m4
    mova          [vhiq], m1
    mova          [vhiq + mmsize], m3

    add             vhiq, 2*mmsize
    add            src1q, 2*mmsize
    sub             offq, 2*mmsize                  ; sets the flags for jge
    jge .loop                                       ; runs for off = 224 .. 0
    REP_RET

View File

@ -34,6 +34,7 @@ void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
/* SSE2 variant of qmf_deint_bfly; ff_sbrdsp_init_x86() installs it as
 * s->qmf_deint_bfly when EXTERNAL_SSE2() reports support.
 * NOTE(review): presumably the symbol emitted by the SSE2 asm routine
 * sbr_qmf_deint_bfly -- confirm against the assembly source. */
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
@ -47,4 +48,8 @@ av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
}
if (EXTERNAL_SSE2(mm_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
}
}