mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-13 19:01:03 +00:00
x86/dcadec: add ff_lfe_fir0_float_{sse,sse2,avx,fma3}
Up to ~4 times faster on x86_64, ~8 times on x86_32 if compiling using x87 fp math. Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com> Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
3e9b8ffc9b
commit
8ae7447941
@ -410,4 +410,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
|
||||
s->dmix_scale_inv = dmix_scale_inv_c;
|
||||
|
||||
s->assemble_freq_bands = assemble_freq_bands_c;
|
||||
|
||||
if (ARCH_X86)
|
||||
ff_dcadsp_init_x86(s);
|
||||
}
|
||||
|
@ -87,5 +87,6 @@ typedef struct DCADSPContext {
|
||||
} DCADSPContext;
|
||||
|
||||
av_cold void ff_dcadsp_init(DCADSPContext *s);
|
||||
av_cold void ff_dcadsp_init_x86(DCADSPContext *s);
|
||||
|
||||
#endif
|
||||
|
@ -46,7 +46,7 @@ OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o
|
||||
OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp_init.o
|
||||
OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o
|
||||
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/synth_filter_init.o
|
||||
OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o
|
||||
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o
|
||||
OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o
|
||||
OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o
|
||||
@ -133,7 +133,7 @@ YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o
|
||||
YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
|
||||
YASM-OBJS-$(CONFIG_ALAC_DECODER) += x86/alacdsp.o
|
||||
YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o
|
||||
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/synth_filter.o
|
||||
YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o
|
||||
YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \
|
||||
x86/dirac_dwt.o
|
||||
YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
|
||||
|
203
libavcodec/x86/dcadsp.asm
Normal file
203
libavcodec/x86/dcadsp.asm
Normal file
@ -0,0 +1,203 @@
|
||||
;******************************************************************************
|
||||
;* SIMD-optimized functions for the DCA decoder
|
||||
;* Copyright (C) 2016 James Almer
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
; Size of one float in bytes; used below to turn sample counts into byte offsets.
%define sizeof_float 4
|
||||
; Extra per-iteration byte step for the FMA3 build on x86_64: that variant
; stores 16 bytes (movaps) per half per inner-loop iteration instead of the
; 8 bytes (movlps) stored by every other variant. Evaluates to 0 elsewhere.
%define FMA3_OFFSET (8 * cpuflag(fma3) * ARCH_X86_64)
|
||||
|
||||
;------------------------------------------------------------------------------
; LFE_FIR0_FLOAT
; Expands to one lfe_fir0_float function for the instruction set selected by
; the preceding INIT_XMM. dcadsp_init.c declares the results as
;   void ff_lfe_fir0_float_<opt>(float *pcm_samples, int32_t *lfe_samples,
;                                const float *filter_coeff, ptrdiff_t npcmblocks)
;
; Named registers: samples (float output), lfe (int32 input), coeff (float
; table, 1024 bytes are read), nblocks; cnt1 and cnt2 are scratch counters.
;
; Each outer iteration converts the 8 int32 values lfe[-7..0] to float and
; writes 64 floats to samples:
;   - outputs 0..31, ascending: dot product of 8 consecutive coefficients with
;     the 8 inputs in reversed order (m7, m6);
;   - outputs 32..63, written from the end of the block backwards: the same
;     coefficients with the 8 inputs in memory order (m5, m4).
; It then advances lfe by one int32 and samples by 64 floats.
;
; The inner loop produces 2 outputs per half per iteration, or 4 in the FMA3
; build on x86_64 (this is what FMA3_OFFSET accounts for).
;
; NOTE(review): the outer loop is bottom-tested, so its body runs once even
; when nblocks >> 1 is 0 -- presumably callers never pass nblocks < 2; confirm.
; NOTE(review): coeff (and samples in the FMA3 x86_64 build) are accessed with
; movaps, so they are assumed to be 16-byte aligned -- confirm against callers.
;------------------------------------------------------------------------------
%macro LFE_FIR0_FLOAT 0
|
||||
; 4 arguments, 6 GPRs, 12 XMM registers (16 in the FMA3 build)
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
|
||||
shr nblocksd, 1 ; outer iteration count = nblocks / 2
|
||||
sub lfeq, 7*sizeof_float ; point at the oldest of the 8 inputs, lfe[-7]
|
||||
mov cnt1d, 32*sizeof_float ; byte size of one 32-float output half
|
||||
mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET ; offset of the last store slot of the second half
|
||||
lea coeffq, [coeffq+cnt1q*8] ; coeff += 1024 bytes, so it can be indexed with negative cnt1*8
|
||||
add samplesq, cnt1q ; samples += 128 bytes: now points at the middle of the 64-float block
|
||||
neg cnt1q ; cnt1 = -128, counts up towards 0
|
||||
|
||||
.loop:
|
||||
; Load the 8 inputs as floats: m5 = lfe[-7..-4], m4 = lfe[-3..0] (memory order);
; m6 / m7 = m5 / m4 with their four lanes reversed (q0123).
%if cpuflag(avx)
|
||||
cvtdq2ps m4, [lfeq+16]
|
||||
cvtdq2ps m5, [lfeq ]
|
||||
shufps m7, m4, m4, q0123
|
||||
shufps m6, m5, m5, q0123
|
||||
%elif cpuflag(sse2)
|
||||
; lfe advances by 4 bytes per outer iteration, hence the unaligned loads
movu m4, [lfeq+16]
|
||||
movu m5, [lfeq ]
|
||||
cvtdq2ps m4, m4
|
||||
cvtdq2ps m5, m5
|
||||
pshufd m7, m4, q0123
|
||||
pshufd m6, m5, q0123
|
||||
%else
|
||||
; Plain SSE has no cvtdq2ps: convert two int32 at a time into the low two
; lanes with cvtpi2ps, then merge the pairs with shufps q1010.
cvtpi2ps m4, [lfeq+16]
|
||||
cvtpi2ps m0, [lfeq+24]
|
||||
cvtpi2ps m5, [lfeq ]
|
||||
cvtpi2ps m1, [lfeq+8 ]
|
||||
shufps m4, m0, q1010
|
||||
shufps m5, m1, q1010
|
||||
shufps m7, m4, m4, q0123
|
||||
shufps m6, m5, m5, q0123
|
||||
%endif
|
||||
|
||||
.inner_loop:
|
||||
; ---- first half: outputs at [samples+cnt1], reversed inputs (m7, m6) ----
; cnt1 is an output byte offset; cnt1*8 is the matching coefficient byte
; offset (8 coefficients = 32 bytes per 4-byte output).
%if ARCH_X86_64
|
||||
; Enough registers on x86_64 to keep the coefficient rows for reuse by the
; second half: two rows (m8..m11), or four rows (m8..m15) with FMA3.
movaps m8, [coeffq+cnt1q*8 ]
|
||||
movaps m9, [coeffq+cnt1q*8+16]
|
||||
movaps m10, [coeffq+cnt1q*8+32]
|
||||
movaps m11, [coeffq+cnt1q*8+48]
|
||||
%if cpuflag(fma3)
|
||||
movaps m12, [coeffq+cnt1q*8+64]
|
||||
movaps m13, [coeffq+cnt1q*8+80]
|
||||
movaps m14, [coeffq+cnt1q*8+96]
|
||||
movaps m15, [coeffq+cnt1q*8+112]
|
||||
mulps m0, m7, m8
|
||||
mulps m1, m7, m10
|
||||
mulps m2, m7, m12
|
||||
mulps m3, m7, m14
|
||||
fmaddps m0, m6, m9, m0
|
||||
fmaddps m1, m6, m11, m1
|
||||
fmaddps m2, m6, m13, m2
|
||||
fmaddps m3, m6, m15, m3
|
||||
|
||||
; three haddps reduce the four accumulators to m0 = [sum0, sum1, sum2, sum3]
haddps m0, m1
|
||||
haddps m2, m3
|
||||
haddps m0, m2
|
||||
movaps [samplesq+cnt1q], m0
|
||||
%else
|
||||
mulps m0, m7, m8
|
||||
mulps m1, m6, m9
|
||||
mulps m2, m7, m10
|
||||
mulps m3, m6, m11
|
||||
addps m0, m1
|
||||
addps m2, m3
|
||||
|
||||
; horizontal reduction without haddps: afterwards the low two lanes of m2
; hold [sum(m0), sum(m2)]
unpckhps m3, m0, m2
|
||||
unpcklps m0, m2
|
||||
addps m3, m0
|
||||
movhlps m2, m3
|
||||
addps m2, m3
|
||||
movlps [samplesq+cnt1q], m2
|
||||
%endif
|
||||
%else ; ARCH_X86_32
|
||||
; Only m0..m7 are used here, so coefficients are taken from memory operands.
%if cpuflag(fma3)
|
||||
mulps m0, m7, [coeffq+cnt1q*8 ]
|
||||
movaps m1, [coeffq+cnt1q*8+16] ; kept in m1: reused by the second half below
|
||||
mulps m2, m7, [coeffq+cnt1q*8+32]
|
||||
fmaddps m0, m6, m1, m0
|
||||
fmaddps m2, m6, [coeffq+cnt1q*8+48], m2
|
||||
%else
|
||||
mulps m0, m7, [coeffq+cnt1q*8 ]
|
||||
mulps m1, m6, [coeffq+cnt1q*8+16]
|
||||
mulps m2, m7, [coeffq+cnt1q*8+32]
|
||||
mulps m3, m6, [coeffq+cnt1q*8+48]
|
||||
addps m0, m1
|
||||
addps m2, m3
|
||||
%endif
|
||||
; low two lanes of m2 become [sum(m0), sum(m2)]
unpckhps m3, m0, m2
|
||||
unpcklps m0, m2
|
||||
addps m3, m0
|
||||
movhlps m2, m3
|
||||
addps m2, m3
|
||||
movlps [samplesq+cnt1q], m2
|
||||
%endif; ARCH
|
||||
|
||||
; ---- second half: outputs at [samples+cnt2], inputs in memory order (m5, m4) ----
; Same coefficient rows as above. cnt2 walks backwards, and the reduction
; operands are swapped so the sum of the first row lands in the highest lane.
%if ARCH_X86_64
|
||||
%if cpuflag(fma3)
|
||||
mulps m8, m5
|
||||
mulps m10, m5
|
||||
mulps m12, m5
|
||||
mulps m14, m5
|
||||
fmaddps m8, m4, m9, m8
|
||||
fmaddps m10, m4, m11, m10
|
||||
fmaddps m12, m4, m13, m12
|
||||
fmaddps m14, m4, m15, m14
|
||||
|
||||
; m14 = [sum(m14), sum(m12), sum(m10), sum(m8)]
haddps m10, m8
|
||||
haddps m14, m12
|
||||
haddps m14, m10
|
||||
movaps [samplesq+cnt2q], m14
|
||||
%else
|
||||
mulps m8, m5
|
||||
mulps m9, m4
|
||||
mulps m10, m5
|
||||
mulps m11, m4
|
||||
addps m8, m9
|
||||
addps m10, m11
|
||||
|
||||
; low two lanes of m8 become [sum(m10), sum(m8)]
unpckhps m11, m10, m8
|
||||
unpcklps m10, m8
|
||||
addps m11, m10
|
||||
movhlps m8, m11
|
||||
addps m8, m11
|
||||
movlps [samplesq+cnt2q], m8
|
||||
%endif
|
||||
%else ; ARCH_X86_32
|
||||
%if cpuflag(fma3)
|
||||
mulps m0, m5, [coeffq+cnt1q*8 ]
|
||||
mulps m2, m5, [coeffq+cnt1q*8+32]
|
||||
fmaddps m0, m4, m1, m0 ; m1 still holds [coeff+cnt1*8+16] from the first half
|
||||
fmaddps m2, m4, [coeffq+cnt1q*8+48], m2
|
||||
%else
|
||||
mulps m0, m5, [coeffq+cnt1q*8 ]
|
||||
mulps m1, m4, [coeffq+cnt1q*8+16]
|
||||
mulps m2, m5, [coeffq+cnt1q*8+32]
|
||||
mulps m3, m4, [coeffq+cnt1q*8+48]
|
||||
addps m0, m1
|
||||
addps m2, m3
|
||||
%endif
|
||||
; low two lanes of m0 become [sum(m2), sum(m0)]
unpckhps m3, m2, m0
|
||||
unpcklps m2, m0
|
||||
addps m3, m2
|
||||
movhlps m0, m3
|
||||
addps m0, m3
|
||||
movlps [samplesq+cnt2q], m0
|
||||
%endif; ARCH
|
||||
|
||||
; The add must stay last: jl tests the flags it sets (loop while cnt1 < 0).
sub cnt2d, 8 + FMA3_OFFSET
|
||||
add cnt1q, 8 + FMA3_OFFSET
|
||||
jl .inner_loop
|
||||
|
||||
add lfeq, 4 ; advance the input by one int32
|
||||
add samplesq, 64*sizeof_float ; advance the output by one 64-float block
|
||||
mov cnt1q, -32*sizeof_float ; reset both counters for the next block
|
||||
mov cnt2d, 32*sizeof_float-8-FMA3_OFFSET
|
||||
sub nblocksd, 1
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate LFE_FIR0_FLOAT once per instruction set; INIT_XMM selects the
; cpuflags that the %if cpuflag(...) tests inside the macro see.
; The plain SSE version is only assembled for 32-bit x86.
%if ARCH_X86_32
|
||||
INIT_XMM sse
|
||||
LFE_FIR0_FLOAT
|
||||
%endif
|
||||
INIT_XMM sse2
|
||||
LFE_FIR0_FLOAT
|
||||
; The AVX and FMA3 versions are only assembled when the corresponding
; HAVE_*_EXTERNAL build option is set.
%if HAVE_AVX_EXTERNAL
|
||||
INIT_XMM avx
|
||||
LFE_FIR0_FLOAT
|
||||
%endif
|
||||
%if HAVE_FMA3_EXTERNAL
|
||||
INIT_XMM fma3
|
||||
LFE_FIR0_FLOAT
|
||||
%endif
|
45
libavcodec/x86/dcadsp_init.c
Normal file
45
libavcodec/x86/dcadsp_init.c
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/dcadsp.h"
|
||||
|
||||
/* Declares the prototype of ff_lfe_fir0_float_<opt>, the lfe_fir0_float
 * variant built for instruction set `opt` (presumably the functions emitted
 * by the LFE_FIR0_FLOAT macro in dcadsp.asm). */
#define LFE_FIR_FLOAT_FUNC(opt) \
|
||||
void ff_lfe_fir0_float_##opt(float *pcm_samples, int32_t *lfe_samples, \
|
||||
const float *filter_coeff, ptrdiff_t npcmblocks);
|
||||
|
||||
/* One prototype per variant referenced by ff_dcadsp_init_x86() below. */
LFE_FIR_FLOAT_FUNC(sse)
|
||||
LFE_FIR_FLOAT_FUNC(sse2)
|
||||
LFE_FIR_FLOAT_FUNC(avx)
|
||||
LFE_FIR_FLOAT_FUNC(fma3)
|
||||
|
||||
/**
 * Install the x86 assembly version of lfe_fir_float[0] into the DSP context.
 *
 * The variants are tried in the order fma3, avx, sse2, sse (the last one on
 * 32-bit x86 only) and the first one allowed by the runtime CPU flags is
 * stored in s->lfe_fir_float[0]. If none is allowed, the context is left
 * untouched.
 *
 * @param s DSP context to update
 */
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
    const int flags = av_get_cpu_flags();

    /* Highest-priority variant first; exactly one assignment is performed. */
    if (EXTERNAL_FMA3(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_fma3;
    } else if (EXTERNAL_AVX(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_avx;
    } else if (EXTERNAL_SSE2(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse2;
    } else if (ARCH_X86_32 && EXTERNAL_SSE(flags)) {
        s->lfe_fir_float[0] = ff_lfe_fir0_float_sse;
    }
}
|
Loading…
Reference in New Issue
Block a user