diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index fb13957044..c5bf21aebd 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -201,3 +201,82 @@ LFE_FIR0_FLOAT
 INIT_XMM fma3
 LFE_FIR0_FLOAT
 %endif
+
+%macro LFE_FIR1_FLOAT 0                      ; emits one lfe_fir1_float body; instantiated below after INIT_XMM sse3 / avx
+cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2 ; x86inc prologue: 4 args, 6 GPRs, 10 vector regs (m8/m9 only touched under ARCH_X86_64)
+    shr nblocksd, 2                          ; outer trip count = nblocks / 4
+    sub     lfeq, 3*sizeof_float             ; rewind lfe by 3 elements; the 16-byte load at .loop then spans lfe[-3..0] (assumes sizeof_float == 4, defined outside this hunk)
+    mov    cnt1d, 64*sizeof_float            ; cnt1 = byte size of 64 floats
+    mov    cnt2d, 64*sizeof_float-16         ; cnt2 = byte offset of the last 16-byte vector in a 64-float span
+    lea   coeffq, [coeffq+cnt1q*4]           ; bias coeff forward so it can be indexed with the negative cnt1 (scaled by 4)
+    add samplesq, cnt1q                      ; bias samples forward by 64 floats for the same reason
+    neg    cnt1q                             ; cnt1 now runs from -64*sizeof_float up to 0
+
+.loop:                                       ; one pass per outer count: writes 128 floats to samples, consumes one new lfe element
+%if cpuflag(avx)
+    cvtdq2ps  m4, [lfeq]                     ; m4 = four int32 LFE samples converted to float
+    shufps    m5, m4, m4, q0123              ; m5 = m4 with its four lanes in reverse order
+%elif cpuflag(sse2)
+    movu      m4, [lfeq]                     ; explicit unaligned load first on the non-AVX path
+    cvtdq2ps  m4, m4                         ; m4 = four int32 LFE samples converted to float
+    pshufd    m5, m4, q0123                  ; m5 = m4 with its four lanes in reverse order
+%endif
+
+.inner_loop:                                 ; each pass reads 16 coefficients and writes 4 floats at each of two output positions
+    movaps    m6, [coeffq+cnt1q*4   ]        ; coefficient vectors 0 and 1 (aligned loads)
+    movaps    m7, [coeffq+cnt1q*4+16]
+    mulps     m0, m5, m6                     ; products against the reversed LFE vector
+    mulps     m1, m5, m7
+%if ARCH_X86_64
+    movaps    m8, [coeffq+cnt1q*4+32]        ; vectors 2 and 3 kept in registers for reuse with m4 below
+    movaps    m9, [coeffq+cnt1q*4+48]
+    mulps     m2, m5, m8
+    mulps     m3, m5, m9
+%else
+    mulps     m2, m5, [coeffq+cnt1q*4+32]    ; 32-bit path: m8/m9 not used, multiply straight from memory
+    mulps     m3, m5, [coeffq+cnt1q*4+48]
+%endif
+
+    haddps    m0, m1                         ; two levels of horizontal adds reduce each product vector to one sum
+    haddps    m2, m3
+    haddps    m0, m2                         ; m0 = { sum(m0), sum(m1), sum(m2), sum(m3) }
+    movaps [samplesq+cnt1q], m0              ; store to the ascending half (cnt1 is negative, samples was biased forward)
+
+    mulps     m6, m4                         ; same coefficients against the non-reversed LFE vector
+    mulps     m7, m4
+%if ARCH_X86_64
+    mulps     m8, m4
+    mulps     m9, m4
+
+    haddps    m6, m7
+    haddps    m8, m9
+    haddps    m6, m8                         ; m6 = four sums, one per coefficient vector
+%else
+    mulps     m2, m4, [coeffq+cnt1q*4+32]    ; 32-bit path: reload coefficient vectors 2 and 3 from memory
+    mulps     m3, m4, [coeffq+cnt1q*4+48]
+
+    haddps    m6, m7
+    haddps    m2, m3
+    haddps    m6, m2                         ; m6 = four sums, one per coefficient vector
+%endif
+    movaps [samplesq+cnt2q], m6              ; store to the second half, which is filled from its end downwards
+
+    sub    cnt2d, 16                         ; second-half offset moves down one vector
+    add    cnt1q, 16                         ; first-half offset moves up one vector; this add sets the flags tested next
+    jl .inner_loop                           ; continue while cnt1 is still negative
+
+    add     lfeq, sizeof_float               ; advance the LFE window by one input sample
+    add samplesq, 128*sizeof_float           ; advance the output pointer by 128 floats
+    mov    cnt1q, -64*sizeof_float           ; re-arm both inner-loop offsets
+    mov    cnt2d,  64*sizeof_float-16
+    sub nblocksd, 1
+    jg .loop                                 ; NOTE(review): bottom-tested, so the body runs once even if nblocks>>2 was 0 - confirm callers never pass that
+    RET
+%endmacro
+
+INIT_XMM sse3                                ; SSE3 build of the macro above (haddps is used in the body)
+LFE_FIR1_FLOAT
+%if HAVE_AVX_EXTERNAL
+INIT_XMM avx                                 ; AVX build, still at XMM width
+LFE_FIR1_FLOAT
+%endif
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index bfe13e5a71..fc10fb8bc5 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,10 +23,13 @@
 
 #define LFE_FIR_FLOAT_FUNC(opt)                                               \
 void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples,        \
+                             const float *filter_coeff, ptrdiff_t npcmblocks); \
+void ff_lfe_fir1_float_##opt(float *pcm_samples, int32_t *lfe_samples,        \
                              const float *filter_coeff, ptrdiff_t npcmblocks);
 
 LFE_FIR_FLOAT_FUNC(sse)
 LFE_FIR_FLOAT_FUNC(sse2)
+LFE_FIR_FLOAT_FUNC(sse3) /* declares both the fir0 and fir1 prototypes with the sse3 suffix */
 LFE_FIR_FLOAT_FUNC(avx)
 LFE_FIR_FLOAT_FUNC(fma3)
 
@@ -38,8 +41,12 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
         s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
     if (EXTERNAL_SSE2(cpu_flags))
         s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
-    if (EXTERNAL_AVX(cpu_flags))
+    if (EXTERNAL_SSE3(cpu_flags)) /* SSE3 only installs the fir1 slot; slot 0 is left as set above */
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_sse3;
+    if (EXTERNAL_AVX(cpu_flags)) { /* braces added because AVX now installs both slots */
         s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
+        s->lfe_fir_float[1] = ff_lfe_fir1_float_avx; /* replaces the SSE3 pointer when AVX is usable */
+    }
     if (EXTERNAL_FMA3(cpu_flags))
         s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
 }