sbcenc: add MMX optimizations

This was originally based on libsbc, and was fully integrated into ffmpeg.

Rough speed test:
C version:    speed= 592x
MMX version:  speed= 785x
This commit is contained in:
Aurelien Jacobs 2017-12-17 20:07:33 +01:00
parent 88508a87a5
commit f1e490b1ad
5 changed files with 226 additions and 0 deletions

View File

@ -379,4 +379,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
/* Default implementation for scale factors calculation */
s->sbc_calc_scalefactors = sbc_calc_scalefactors;
s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
if (ARCH_X86)
ff_sbcdsp_init_x86(s);
}

View File

@ -80,4 +80,6 @@ struct sbc_dsp_context {
*/
void ff_sbcdsp_init(SBCDSPContext *s);
void ff_sbcdsp_init_x86(SBCDSPContext *s);
#endif /* AVCODEC_SBCDSP_H */

View File

@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o

168
libavcodec/x86/sbcdsp.asm Normal file
View File

@ -0,0 +1,168 @@
;******************************************************************************
;* SIMD optimized SBC encoder DSP functions
;*
;* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
;* Copyright (C) 2008-2010 Nokia Corporation
;* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
;* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
;* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
SECTION .text
%macro NIDN 3
%ifnidn %2, %3
%1 %2, %3
%endif
%endmacro
%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
NIDN movq, %5, %3
NIDN movq, %6, %4
pmaddwd %5, [constsq+%9]
pmaddwd %6, [constsq+%9+8]
NIDN paddd, %1, %7
NIDN paddd, %2, %8
%endmacro
%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
ANALYZE_MAC %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
%endmacro
%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
%ifidn %7, pack
psrad %3, 16 ; SBC_PROTO_FIXED_SCALE
packssdw %3, %3
%endif
ANALYZE_MAC %1, %2, %3, %3, %4, %5, %4, %5, %6
%endmacro
;*******************************************************************
;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 16
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 32
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 48
ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 64
ANALYZE_MAC_REG m0, m2, m0, m0, m2, 80, pack
ANALYZE_MAC_REG m0, m2, m1, m1, m3, 96, pack
movq [outq ], m0
movq [outq+8], m2
RET
;*******************************************************************
;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
;*******************************************************************
INIT_MMX mmx
cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0
ANALYZE_MAC_IN m2, m3, m2, m3, [scale_mask], [scale_mask], 16
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 32
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 48
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 64
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 80
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 96
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 112
ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 128
ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 144
ANALYZE_MAC_REG m4, m5, m0, m4, m5, 160, pack
ANALYZE_MAC_REG m4, m5, m1, m6, m7, 192, pack
ANALYZE_MAC_REG m4, m5, m2, m6, m7, 224, pack
ANALYZE_MAC_REG m4, m5, m3, m6, m7, 256, pack
movq [outq ], m4
movq [outq+8], m5
ANALYZE_MAC_REG m0, m5, m0, m0, m5, 176, no
ANALYZE_MAC_REG m0, m5, m1, m1, m7, 208, no
ANALYZE_MAC_REG m0, m5, m2, m2, m7, 240, no
ANALYZE_MAC_REG m0, m5, m3, m3, m7, 272, no
movq [outq+16], m0
movq [outq+24], m5
RET
;*******************************************************************
;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
; uint32_t scale_factor[2][8],
; int blocks, int channels, int subbands)
;*******************************************************************
INIT_MMX mmx
cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
; subbands = 4 * subbands * channels
movq m3, [scale_mask]
shl subbandsd, 2
cmp channelsd, 2
jl .loop_1
shl subbandsd, 1
.loop_1:
sub subbandsq, 8
lea ptrq, [sb_sample_fq + subbandsq]
; blk = (blocks - 1) * 64;
lea blkq, [blocksq - 1]
shl blkd, 6
movq m0, m3
.loop_2:
movq m1, [ptrq+blkq]
pxor m2, m2
pcmpgtd m1, m2
paddd m1, [ptrq+blkq]
pcmpgtd m2, m1
pxor m1, m2
por m0, m1
sub blkq, 64
jns .loop_2
movd blkd, m0
psrlq m0, 32
bsr blkd, blkd
sub blkd, 15 ; SCALE_OUT_BITS
mov [scale_factorq + subbandsq], blkd
movd blkd, m0
bsr blkd, blkd
sub blkd, 15 ; SCALE_OUT_BITS
mov [scale_factorq + subbandsq + 4], blkd
cmp subbandsq, 0
jg .loop_1
emms
RET

View File

@ -0,0 +1,51 @@
/*
* Bluetooth low-complexity, subband codec (SBC)
*
* Copyright (C) 2017 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2008-2010 Nokia Corporation
* Copyright (C) 2004-2010 Marcel Holtmann <marcel@holtmann.org>
* Copyright (C) 2004-2005 Henryk Ploetz <henryk@ploetzli.ch>
* Copyright (C) 2005-2006 Brad Midgley <bmidgley@xmission.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* SBC MMX optimization for some basic "building bricks"
*/
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbcdsp.h"
void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
}
}