x86/dcadec: add ff_lfe_fir1_float_{sse3,avx}

Reviewed-by: Christophe Gisquet <christophe.gisquet@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2016-02-22 19:59:07 -03:00
parent 37afeabd1b
commit 45d3af9059
2 changed files with 87 additions and 1 deletion

View File

@ -201,3 +201,82 @@ LFE_FIR0_FLOAT
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif
;-----------------------------------------------------------------------------
; ff_lfe_fir1_float_<opt>(samples, lfe, coeff, nblocks)
;
; Runs (nblocks >> 2) passes; the count is tested at the bottom of the loop,
; so one pass is always executed. Per pass:
;   - four packed int32 values, starting 3*sizeof_float bytes before the
;     current lfe pointer, are converted to float (m4) and also kept in
;     reversed element order (m5);
;   - the coefficient table is walked 64 bytes at a time; every step produces
;     four 4-element dot products against m5 and four against m4;
;   - the m5 results fill the first 64*sizeof_float bytes of the output block
;     in ascending 16-byte groups, the m4 results fill the following
;     64*sizeof_float bytes in descending 16-byte groups;
;   - lfe then advances by sizeof_float bytes and samples by
;     128*sizeof_float bytes.
;
; samples and coeff are accessed with movaps and therefore must be 16-byte
; aligned; lfe is read with an unaligned load.
;
; Registers: m0-m3 products / partial sums, m4 and m5 LFE vectors,
;            m6-m9 coefficients (m8/m9 exist only on x86-64; the 32-bit
;            build re-reads those coefficients from memory instead).
;            fwd = negative byte offset counting up to zero,
;            rev = positive byte offset counting down.
;-----------------------------------------------------------------------------
%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, fwd, rev
    shr     nblocksd, 2
    sub     lfeq, 3*sizeof_float
    ; Bias both pointers to the end of their range so that a single negative
    ; offset counting up to zero serves as index and as loop condition.
    lea     coeffq, [coeffq+64*sizeof_float*4]
    add     samplesq, 64*sizeof_float
    mov     fwdq, -64*sizeof_float
    mov     revd, 64*sizeof_float-16

.next_block:
%if cpuflag(avx)
    cvtdq2ps m4, [lfeq]                 ; VEX memory operand, no alignment needed
    shufps  m5, m4, m4, q0123           ; m5 = m4 with element order reversed
%elif cpuflag(sse2)
    movu    m4, [lfeq]
    cvtdq2ps m4, m4
    pshufd  m5, m4, q0123               ; m5 = m4 with element order reversed
%endif

.next_taps:
    ; First half: coefficients times the reversed LFE vector.
    movaps  m6, [coeffq+fwdq*4   ]
    mulps   m0, m5, m6
    movaps  m7, [coeffq+fwdq*4+16]
    mulps   m1, m5, m7
%if ARCH_X86_64
    movaps  m8, [coeffq+fwdq*4+32]
    mulps   m2, m5, m8
    movaps  m9, [coeffq+fwdq*4+48]
    mulps   m3, m5, m9
%else
    mulps   m2, m5, [coeffq+fwdq*4+32]
    mulps   m3, m5, [coeffq+fwdq*4+48]
%endif
    ; Two-level horizontal add: m0 = { sum(m0), sum(m1), sum(m2), sum(m3) }
    haddps  m0, m1
    haddps  m2, m3
    haddps  m0, m2
    movaps  [samplesq+fwdq], m0

    ; Second half: the same coefficients times the unreversed LFE vector.
    mulps   m6, m4
    mulps   m7, m4
    haddps  m6, m7
%if ARCH_X86_64
    mulps   m8, m4
    mulps   m9, m4
    haddps  m8, m9
    haddps  m6, m8
%else
    mulps   m2, m4, [coeffq+fwdq*4+32]
    mulps   m3, m4, [coeffq+fwdq*4+48]
    haddps  m2, m3
    haddps  m6, m2
%endif
    movaps  [samplesq+revq], m6

    sub     revd, 16
    add     fwdq, 16                    ; must stay last: jl consumes its flags
    jl .next_taps

    ; Move on to the next LFE sample / output block and rewind both offsets.
    add     samplesq, 128*sizeof_float
    add     lfeq, sizeof_float
    mov     revd, 64*sizeof_float-16
    mov     fwdq, -64*sizeof_float
    sub     nblocksd, 1                 ; must stay last: jg consumes its flags
    jg .next_block
    RET
%endmacro
; Emit the SSE3 flavour of lfe_fir1_float. SSE3 is the baseline here because
; the macro body uses haddps, which is an SSE3 instruction.
INIT_XMM sse3
LFE_FIR1_FLOAT
; Emit the AVX flavour of the same macro (still on XMM registers), but only
; when the build has HAVE_AVX_EXTERNAL set.
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif

View File

@ -23,10 +23,13 @@
/*
 * Declare the prototypes of the assembly LFE FIR kernels for one
 * instruction-set suffix: ff_lfe_fir0_float_<opt> and ff_lfe_fir1_float_<opt>.
 * Both take the output buffer, the int32 LFE input, the filter coefficients
 * and a block count.
 *
 * NOTE(review): the macro declares both kernels for every suffix; a
 * declaration without a matching assembly symbol is harmless as long as it
 * is never referenced - confirm against the .asm instantiations.
 */
#define LFE_FIR_FLOAT_FUNC(opt)                                               \
    void ff_lfe_fir0_float_##opt(float *pcm_samples,                          \
                                 int32_t *lfe_samples,                        \
                                 const float *filter_coeff,                   \
                                 ptrdiff_t npcmblocks);                       \
    void ff_lfe_fir1_float_##opt(float *pcm_samples,                          \
                                 int32_t *lfe_samples,                        \
                                 const float *filter_coeff,                   \
                                 ptrdiff_t npcmblocks);

/* One expansion per instruction-set suffix. */
LFE_FIR_FLOAT_FUNC(sse)
LFE_FIR_FLOAT_FUNC(sse2)
LFE_FIR_FLOAT_FUNC(sse3)
LFE_FIR_FLOAT_FUNC(avx)
LFE_FIR_FLOAT_FUNC(fma3)
@ -38,8 +41,12 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
if (EXTERNAL_AVX(cpu_flags))
if (EXTERNAL_SSE3(cpu_flags))
s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
if (EXTERNAL_AVX(cpu_flags)) {
s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
s->lfe_fir_float[1] = ff_lfe_fir1_float_avx;
}
if (EXTERNAL_FMA3(cpu_flags))
s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
}