From f677718bc87a96c05edda25be7e8c9b9dc357f5d Mon Sep 17 00:00:00 2001 From: Aurelien Jacobs Date: Sun, 17 Dec 2017 20:10:32 +0100 Subject: [PATCH] sbcenc: add armv6 and neon asm optimizations This was originally based on libsbc, and was fully integrated into ffmpeg. --- libavcodec/arm/Makefile | 3 + libavcodec/arm/sbcdsp_armv6.S | 245 +++++++++++ libavcodec/arm/sbcdsp_init_arm.c | 105 +++++ libavcodec/arm/sbcdsp_neon.S | 714 +++++++++++++++++++++++++++++++ libavcodec/sbcdsp.c | 2 + libavcodec/sbcdsp.h | 1 + 6 files changed, 1070 insertions(+) create mode 100644 libavcodec/arm/sbcdsp_armv6.S create mode 100644 libavcodec/arm/sbcdsp_init_arm.c create mode 100644 libavcodec/arm/sbcdsp_neon.S diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 1eeac5449e..fd2401f4e5 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -42,6 +42,7 @@ OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o +OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o \ @@ -81,6 +82,7 @@ ARMV6-OBJS-$(CONFIG_VP8DSP) += arm/vp8_armv6.o \ # decoders/encoders ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o +ARMV6-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_armv6.o # VFP optimizations @@ -140,6 +142,7 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o +NEON-OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o NEON-OBJS-$(CONFIG_VP9_DECODER) += arm/vp9itxfm_16bpp_neon.o \ diff --git 
a/libavcodec/arm/sbcdsp_armv6.S b/libavcodec/arm/sbcdsp_armv6.S new file mode 100644 index 0000000000..f1ff845798 --- /dev/null +++ b/libavcodec/arm/sbcdsp_armv6.S @@ -0,0 +1,245 @@ +/* + * Bluetooth low-complexity, subband codec (SBC) + * + * Copyright (C) 2017 Aurelien Jacobs + * Copyright (C) 2008-2010 Nokia Corporation + * Copyright (C) 2004-2010 Marcel Holtmann + * Copyright (C) 2004-2005 Henryk Ploetz + * Copyright (C) 2005-2006 Brad Midgley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * SBC ARMv6 optimizations. The instructions are scheduled for ARM11 pipeline. 
+ */ + +#include "libavutil/arm/asm.S" + +function ff_sbc_analyze_4_armv6, export=1 + @ r0 = in, r1 = out, r2 = consts + push {r1, r3-r7, lr} + push {r8-r12, r14} + ldrd r4, r5, [r0, #0] + ldrd r6, r7, [r2, #0] + ldrd r8, r9, [r0, #16] + ldrd r10, r11, [r2, #16] + mov r14, #0x8000 + smlad r3, r4, r6, r14 + smlad r12, r5, r7, r14 + ldrd r4, r5, [r0, #32] + ldrd r6, r7, [r2, #32] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #48] + ldrd r10, r11, [r2, #48] + smlad r3, r4, r6, r3 + smlad r12, r5, r7, r12 + ldrd r4, r5, [r0, #64] + ldrd r6, r7, [r2, #64] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #8] + ldrd r10, r11, [r2, #8] + smlad r3, r4, r6, r3 @ t1[0] is done + smlad r12, r5, r7, r12 @ t1[1] is done + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r2, #24] + pkhtb r3, r12, r3, asr #16 @ combine t1[0] and t1[1] + smlad r12, r8, r10, r14 + smlad r14, r9, r11, r14 + ldrd r8, r9, [r0, #40] + ldrd r10, r11, [r2, #40] + smlad r12, r4, r6, r12 + smlad r14, r5, r7, r14 + ldrd r4, r5, [r0, #56] + ldrd r6, r7, [r2, #56] + smlad r12, r8, r10, r12 + smlad r14, r9, r11, r14 + ldrd r8, r9, [r0, #72] + ldrd r10, r11, [r2, #72] + smlad r12, r4, r6, r12 + smlad r14, r5, r7, r14 + ldrd r4, r5, [r2, #80] @ start loading cos table + smlad r12, r8, r10, r12 @ t1[2] is done + smlad r14, r9, r11, r14 @ t1[3] is done + ldrd r6, r7, [r2, #88] + ldrd r8, r9, [r2, #96] + ldrd r10, r11, [r2, #104] @ cos table fully loaded + pkhtb r12, r14, r12, asr #16 @ combine t1[2] and t1[3] + smuad r4, r3, r4 + smuad r5, r3, r5 + smlad r4, r12, r8, r4 + smlad r5, r12, r9, r5 + smuad r6, r3, r6 + smuad r7, r3, r7 + smlad r6, r12, r10, r6 + smlad r7, r12, r11, r7 + pop {r8-r12, r14} + stmia r1, {r4, r5, r6, r7} + pop {r1, r3-r7, pc} +endfunc + +function ff_sbc_analyze_8_armv6, export=1 + @ r0 = in, r1 = out, r2 = consts + push {r1, r3-r7, lr} + push {r8-r12, r14} + ldrd r4, r5, [r0, #24] + ldrd r6, r7, [r2, #24] + ldrd r8, r9, [r0, #56] + ldrd r10, r11, [r2, #56] 
+ mov r14, #0x8000 + smlad r3, r4, r6, r14 + smlad r12, r5, r7, r14 + ldrd r4, r5, [r0, #88] + ldrd r6, r7, [r2, #88] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #120] + ldrd r10, r11, [r2, #120] + smlad r3, r4, r6, r3 + smlad r12, r5, r7, r12 + ldrd r4, r5, [r0, #152] + ldrd r6, r7, [r2, #152] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #16] + ldrd r10, r11, [r2, #16] + smlad r3, r4, r6, r3 @ t1[6] is done + smlad r12, r5, r7, r12 @ t1[7] is done + ldrd r4, r5, [r0, #48] + ldrd r6, r7, [r2, #48] + pkhtb r3, r12, r3, asr #16 @ combine t1[6] and t1[7] + str r3, [sp, #-4]! @ save to stack + smlad r3, r8, r10, r14 + smlad r12, r9, r11, r14 + ldrd r8, r9, [r0, #80] + ldrd r10, r11, [r2, #80] + smlad r3, r4, r6, r3 + smlad r12, r5, r7, r12 + ldrd r4, r5, [r0, #112] + ldrd r6, r7, [r2, #112] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #144] + ldrd r10, r11, [r2, #144] + smlad r3, r4, r6, r3 + smlad r12, r5, r7, r12 + ldrd r4, r5, [r0, #0] + ldrd r6, r7, [r2, #0] + smlad r3, r8, r10, r3 @ t1[4] is done + smlad r12, r9, r11, r12 @ t1[5] is done + ldrd r8, r9, [r0, #32] + ldrd r10, r11, [r2, #32] + pkhtb r3, r12, r3, asr #16 @ combine t1[4] and t1[5] + str r3, [sp, #-4]! 
@ save to stack + smlad r3, r4, r6, r14 + smlad r12, r5, r7, r14 + ldrd r4, r5, [r0, #64] + ldrd r6, r7, [r2, #64] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #96] + ldrd r10, r11, [r2, #96] + smlad r3, r4, r6, r3 + smlad r12, r5, r7, r12 + ldrd r4, r5, [r0, #128] + ldrd r6, r7, [r2, #128] + smlad r3, r8, r10, r3 + smlad r12, r9, r11, r12 + ldrd r8, r9, [r0, #8] + ldrd r10, r11, [r2, #8] + smlad r3, r4, r6, r3 @ t1[0] is done + smlad r12, r5, r7, r12 @ t1[1] is done + ldrd r4, r5, [r0, #40] + ldrd r6, r7, [r2, #40] + pkhtb r3, r12, r3, asr #16 @ combine t1[0] and t1[1] + smlad r12, r8, r10, r14 + smlad r14, r9, r11, r14 + ldrd r8, r9, [r0, #72] + ldrd r10, r11, [r2, #72] + smlad r12, r4, r6, r12 + smlad r14, r5, r7, r14 + ldrd r4, r5, [r0, #104] + ldrd r6, r7, [r2, #104] + smlad r12, r8, r10, r12 + smlad r14, r9, r11, r14 + ldrd r8, r9, [r0, #136] + ldrd r10, r11, [r2, #136]! + smlad r12, r4, r6, r12 + smlad r14, r5, r7, r14 + ldrd r4, r5, [r2, #(160 - 136 + 0)] + smlad r12, r8, r10, r12 @ t1[2] is done + smlad r14, r9, r11, r14 @ t1[3] is done + ldrd r6, r7, [r2, #(160 - 136 + 8)] + smuad r4, r3, r4 + smuad r5, r3, r5 + pkhtb r12, r14, r12, asr #16 @ combine t1[2] and t1[3] + @ r3 = t2[0:1] + @ r12 = t2[2:3] + pop {r0, r14} @ t2[4:5], t2[6:7] + ldrd r8, r9, [r2, #(160 - 136 + 32)] + smuad r6, r3, r6 + smuad r7, r3, r7 + ldrd r10, r11, [r2, #(160 - 136 + 40)] + smlad r4, r12, r8, r4 + smlad r5, r12, r9, r5 + ldrd r8, r9, [r2, #(160 - 136 + 64)] + smlad r6, r12, r10, r6 + smlad r7, r12, r11, r7 + ldrd r10, r11, [r2, #(160 - 136 + 72)] + smlad r4, r0, r8, r4 + smlad r5, r0, r9, r5 + ldrd r8, r9, [r2, #(160 - 136 + 96)] + smlad r6, r0, r10, r6 + smlad r7, r0, r11, r7 + ldrd r10, r11, [r2, #(160 - 136 + 104)] + smlad r4, r14, r8, r4 + smlad r5, r14, r9, r5 + ldrd r8, r9, [r2, #(160 - 136 + 16 + 0)] + smlad r6, r14, r10, r6 + smlad r7, r14, r11, r7 + ldrd r10, r11, [r2, #(160 - 136 + 16 + 8)] + stmia r1!, {r4, r5} + smuad r4, r3, r8 + smuad r5, 
r3, r9 + ldrd r8, r9, [r2, #(160 - 136 + 16 + 32)] + stmia r1!, {r6, r7} + smuad r6, r3, r10 + smuad r7, r3, r11 + ldrd r10, r11, [r2, #(160 - 136 + 16 + 40)] + smlad r4, r12, r8, r4 + smlad r5, r12, r9, r5 + ldrd r8, r9, [r2, #(160 - 136 + 16 + 64)] + smlad r6, r12, r10, r6 + smlad r7, r12, r11, r7 + ldrd r10, r11, [r2, #(160 - 136 + 16 + 72)] + smlad r4, r0, r8, r4 + smlad r5, r0, r9, r5 + ldrd r8, r9, [r2, #(160 - 136 + 16 + 96)] + smlad r6, r0, r10, r6 + smlad r7, r0, r11, r7 + ldrd r10, r11, [r2, #(160 - 136 + 16 + 104)] + smlad r4, r14, r8, r4 + smlad r5, r14, r9, r5 + smlad r6, r14, r10, r6 + smlad r7, r14, r11, r7 + pop {r8-r12, r14} + stmia r1!, {r4, r5, r6, r7} + pop {r1, r3-r7, pc} +endfunc diff --git a/libavcodec/arm/sbcdsp_init_arm.c b/libavcodec/arm/sbcdsp_init_arm.c new file mode 100644 index 0000000000..6bf7e729ef --- /dev/null +++ b/libavcodec/arm/sbcdsp_init_arm.c @@ -0,0 +1,105 @@ +/* + * Bluetooth low-complexity, subband codec (SBC) + * + * Copyright (C) 2017 Aurelien Jacobs + * Copyright (C) 2008-2010 Nokia Corporation + * Copyright (C) 2004-2010 Marcel Holtmann + * Copyright (C) 2004-2005 Henryk Ploetz + * Copyright (C) 2005-2006 Brad Midgley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * SBC ARMv6 optimization for some basic "building bricks" + */ + +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/sbcdsp.h" + +void ff_sbc_analyze_4_armv6(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_analyze_8_armv6(const int16_t *in, int32_t *out, const int16_t *consts); + +void ff_sbc_analyze_4_neon(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_analyze_8_neon(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_calc_scalefactors_neon(int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int channels, int subbands); +int ff_sbc_calc_scalefactors_j_neon(int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int subbands); +int ff_sbc_enc_process_input_4s_neon(int position, const uint8_t *pcm, + int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); +int ff_sbc_enc_process_input_8s_neon(int position, const uint8_t *pcm, + int16_t X[2][SBC_X_BUFFER_SIZE], + int nsamples, int nchannels); + +DECLARE_ALIGNED(SBC_ALIGN, int32_t, ff_sbcdsp_joint_bits_mask)[8] = { + 8, 4, 2, 1, 128, 64, 32, 16 +}; + +#if HAVE_BIGENDIAN +#define PERM(a, b, c, d) { \ + (a * 2) + 1, (a * 2) + 0, \ + (b * 2) + 1, (b * 2) + 0, \ + (c * 2) + 1, (c * 2) + 0, \ + (d * 2) + 1, (d * 2) + 0 \ + } +#else +#define PERM(a, b, c, d) { \ + (a * 2) + 0, (a * 2) + 1, \ + (b * 2) + 0, (b * 2) + 1, \ + (c * 2) + 0, (c * 2) + 1, \ + (d * 2) + 0, (d * 2) + 1 \ + } +#endif + +DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_4)[2][8] = { + PERM(7, 3, 6, 4), + PERM(0, 2, 1, 5) +}; + +DECLARE_ALIGNED(SBC_ALIGN, uint8_t, ff_sbc_input_perm_8)[4][8] = { + PERM(15, 7, 14, 8), + PERM(13, 9, 12, 10), + PERM(11, 3, 6, 0), + 
PERM( 5, 1, 4, 2) +}; + +av_cold void ff_sbcdsp_init_arm(SBCDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + s->sbc_analyze_4 = ff_sbc_analyze_4_armv6; + s->sbc_analyze_8 = ff_sbc_analyze_8_armv6; + } + + if (have_neon(cpu_flags)) { + s->sbc_analyze_4 = ff_sbc_analyze_4_neon; + s->sbc_analyze_8 = ff_sbc_analyze_8_neon; + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_neon; + s->sbc_calc_scalefactors_j = ff_sbc_calc_scalefactors_j_neon; + if (s->increment != 1) { + s->sbc_enc_process_input_4s = ff_sbc_enc_process_input_4s_neon; + s->sbc_enc_process_input_8s = ff_sbc_enc_process_input_8s_neon; + } + } +} diff --git a/libavcodec/arm/sbcdsp_neon.S b/libavcodec/arm/sbcdsp_neon.S new file mode 100644 index 0000000000..d83d21d202 --- /dev/null +++ b/libavcodec/arm/sbcdsp_neon.S @@ -0,0 +1,714 @@ +/* + * Bluetooth low-complexity, subband codec (SBC) + * + * Copyright (C) 2017 Aurelien Jacobs + * Copyright (C) 2008-2010 Nokia Corporation + * Copyright (C) 2004-2010 Marcel Holtmann + * Copyright (C) 2004-2005 Henryk Ploetz + * Copyright (C) 2005-2006 Brad Midgley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * SBC ARM NEON optimizations + */ + +#include "libavutil/arm/asm.S" +#include "neon.S" + +#define SBC_PROTO_FIXED_SCALE 16 + +function ff_sbc_analyze_4_neon, export=1 + /* TODO: merge even and odd cases (or even merge all four calls to this + * function) in order to have only aligned reads from 'in' array + * and reduce number of load instructions */ + vld1.16 {d4, d5}, [r0, :64]! + vld1.16 {d16, d17}, [r2, :128]! /* consts use d16-d19: d8-d15 are callee-saved (AAPCS) */ + + vmull.s16 q0, d4, d16 + vld1.16 {d6, d7}, [r0, :64]! + vmull.s16 q1, d5, d17 + vld1.16 {d18, d19}, [r2, :128]! + + vmlal.s16 q0, d6, d18 + vld1.16 {d4, d5}, [r0, :64]! + vmlal.s16 q1, d7, d19 + vld1.16 {d16, d17}, [r2, :128]! + + vmlal.s16 q0, d4, d16 + vld1.16 {d6, d7}, [r0, :64]! + vmlal.s16 q1, d5, d17 + vld1.16 {d18, d19}, [r2, :128]! + + vmlal.s16 q0, d6, d18 + vld1.16 {d4, d5}, [r0, :64]! + vmlal.s16 q1, d7, d19 + vld1.16 {d16, d17}, [r2, :128]! + + vmlal.s16 q0, d4, d16 + vmlal.s16 q1, d5, d17 + + vpadd.s32 d0, d0, d1 + vpadd.s32 d1, d2, d3 + + vrshrn.s32 d0, q0, SBC_PROTO_FIXED_SCALE + + vld1.16 {d2, d3, d4, d5}, [r2, :128]! + + vdup.i32 d1, d0[1] /* TODO: can be eliminated */ + vdup.i32 d0, d0[0] /* TODO: can be eliminated */ + + vmull.s16 q3, d2, d0 + vmull.s16 q8, d3, d0 /* q8 rather than callee-saved q4 */ + vmlal.s16 q3, d4, d1 + vmlal.s16 q8, d5, d1 + + vpadd.s32 d0, d6, d7 /* TODO: can be eliminated */ + vpadd.s32 d1, d16, d17 /* TODO: can be eliminated */ + + vst1.32 {d0, d1}, [r1, :128] + + bx lr +endfunc + +function ff_sbc_analyze_8_neon, export=1 + /* TODO: merge even and odd cases (or even merge all four calls to this + * function) in order to have only aligned reads from 'in' array + * and reduce number of load instructions */ + vld1.16 {d4, d5}, [r0, :64]! + vld1.16 {d20, d21}, [r2, :128]! /* consts in d20-d23, sums in q8-q9/q12-q13: d8-d15 are callee-saved (AAPCS) */
+ + vmull.s16 q12, d4, d20 + vld1.16 {d6, d7}, [r0, :64]! + vmull.s16 q13, d5, d21 + vld1.16 {d22, d23}, [r2, :128]! + vmull.s16 q8, d6, d22 + vld1.16 {d4, d5}, [r0, :64]! + vmull.s16 q9, d7, d23 + vld1.16 {d20, d21}, [r2, :128]! + + vmlal.s16 q12, d4, d20 + vld1.16 {d6, d7}, [r0, :64]! + vmlal.s16 q13, d5, d21 + vld1.16 {d22, d23}, [r2, :128]! + vmlal.s16 q8, d6, d22 + vld1.16 {d4, d5}, [r0, :64]! + vmlal.s16 q9, d7, d23 + vld1.16 {d20, d21}, [r2, :128]! + + vmlal.s16 q12, d4, d20 + vld1.16 {d6, d7}, [r0, :64]! + vmlal.s16 q13, d5, d21 + vld1.16 {d22, d23}, [r2, :128]! + vmlal.s16 q8, d6, d22 + vld1.16 {d4, d5}, [r0, :64]! + vmlal.s16 q9, d7, d23 + vld1.16 {d20, d21}, [r2, :128]! + + vmlal.s16 q12, d4, d20 + vld1.16 {d6, d7}, [r0, :64]! + vmlal.s16 q13, d5, d21 + vld1.16 {d22, d23}, [r2, :128]! + vmlal.s16 q8, d6, d22 + vld1.16 {d4, d5}, [r0, :64]! + vmlal.s16 q9, d7, d23 + vld1.16 {d20, d21}, [r2, :128]! + + vmlal.s16 q12, d4, d20 + vld1.16 {d6, d7}, [r0, :64]! + vmlal.s16 q13, d5, d21 + vld1.16 {d22, d23}, [r2, :128]! + + vmlal.s16 q8, d6, d22 + vmlal.s16 q9, d7, d23 + + vpadd.s32 d0, d24, d25 + vpadd.s32 d1, d26, d27 + vpadd.s32 d2, d16, d17 + vpadd.s32 d3, d18, d19 + + vrshr.s32 q0, q0, SBC_PROTO_FIXED_SCALE + vrshr.s32 q1, q1, SBC_PROTO_FIXED_SCALE + vmovn.s32 d0, q0 + vmovn.s32 d1, q1 + + vdup.i32 d3, d1[1] /* TODO: can be eliminated */ + vdup.i32 d2, d1[0] /* TODO: can be eliminated */ + vdup.i32 d1, d0[1] /* TODO: can be eliminated */ + vdup.i32 d0, d0[0] /* TODO: can be eliminated */ + + vld1.16 {d4, d5}, [r2, :128]! + vmull.s16 q12, d4, d0 + vld1.16 {d6, d7}, [r2, :128]! + vmull.s16 q13, d5, d0 + vmull.s16 q8, d6, d0 + vmull.s16 q9, d7, d0 + + vld1.16 {d4, d5}, [r2, :128]! + vmlal.s16 q12, d4, d1 + vld1.16 {d6, d7}, [r2, :128]! + vmlal.s16 q13, d5, d1 + vmlal.s16 q8, d6, d1 + vmlal.s16 q9, d7, d1 + + vld1.16 {d4, d5}, [r2, :128]! + vmlal.s16 q12, d4, d2 + vld1.16 {d6, d7}, [r2, :128]! + vmlal.s16 q13, d5, d2 + vmlal.s16 q8, d6, d2 + vmlal.s16 q9, d7, d2 + + vld1.16 {d4, d5}, [r2, :128]! 
+ vmlal.s16 q12, d4, d3 + vld1.16 {d6, d7}, [r2, :128]! + vmlal.s16 q13, d5, d3 + vmlal.s16 q8, d6, d3 + vmlal.s16 q9, d7, d3 + + vpadd.s32 d0, d24, d25 /* TODO: can be eliminated */ + vpadd.s32 d1, d26, d27 /* TODO: can be eliminated */ + vpadd.s32 d2, d16, d17 /* TODO: can be eliminated */ + vpadd.s32 d3, d18, d19 /* TODO: can be eliminated */ + + vst1.32 {d0, d1, d2, d3}, [r1, :128] + + bx lr +endfunc + +function ff_sbc_calc_scalefactors_neon, export=1 + @ parameters + @ r0 = sb_sample_f + @ r1 = scale_factor + @ r2 = blocks + @ r3 = channels + @ r4 = subbands + @ local variables + @ r5 = in_loop_1 + @ r6 = in + @ r7 = out_loop_1 + @ r8 = out + @ r9 = ch + @ r10 = sb + @ r11 = inc + @ r12 = blk + + push {r1-r2, r4-r12} + ldr r4, [sp, #44] + mov r11, #64 + + mov r9, #0 +1: + add r5, r0, r9, lsl#5 + add r7, r1, r9, lsl#5 + + mov r10, #0 +2: + add r6, r5, r10, lsl#2 + add r8, r7, r10, lsl#2 + mov r12, r2 + + vmov.s32 q0, #0 + vmov.s32 q1, #0x8000 @ 1 << SCALE_OUT_BITS + vmov.s32 q14, #1 + vmov.s32 q15, #16 @ 31 - SCALE_OUT_BITS + vadd.s32 q1, q1, q14 +3: + vld1.32 {d16, d17}, [r6, :128], r11 + vabs.s32 q8, q8 + vld1.32 {d18, d19}, [r6, :128], r11 + vabs.s32 q9, q9 + vld1.32 {d20, d21}, [r6, :128], r11 + vabs.s32 q10, q10 + vld1.32 {d22, d23}, [r6, :128], r11 + vabs.s32 q11, q11 + vmax.s32 q0, q0, q8 + vmax.s32 q1, q1, q9 + vmax.s32 q0, q0, q10 + vmax.s32 q1, q1, q11 + subs r12, r12, #4 + bgt 3b + vmax.s32 q0, q0, q1 + vsub.s32 q0, q0, q14 + vclz.s32 q0, q0 + vsub.s32 q0, q15, q0 + vst1.32 {d0, d1}, [r8, :128] + + add r10, r10, #4 + cmp r10, r4 + blt 2b + + add r9, r9, #1 + cmp r9, r3 + blt 1b + + pop {r1-r2, r4-r12} + bx lr +endfunc + +/* + * constants: q13 = (31 - SCALE_OUT_BITS) + * q14 = 1 + * input: q0 - ((1 << SCALE_OUT_BITS) + 1) + * r5 - samples for channel 0 + * r6 - samples for channel 1 + * output: q0, q1 - scale factors without joint stereo + * q2, q3 - scale factors with joint stereo + * q15 - joint stereo selection mask + */ +.macro calc_scalefactors +
vmov.s32 q1, q0 + vmov.s32 q2, q0 + vmov.s32 q3, q0 + mov r3, r2 +1: + vld1.32 {d18, d19}, [r6, :128], r11 + vbic.s32 q11, q9, q14 + vld1.32 {d16, d17}, [r5, :128], r11 + vhadd.s32 q10, q8, q11 + vhsub.s32 q11, q8, q11 + vabs.s32 q8, q8 + vabs.s32 q9, q9 + vabs.s32 q10, q10 + vabs.s32 q11, q11 + vmax.s32 q0, q0, q8 + vmax.s32 q1, q1, q9 + vmax.s32 q2, q2, q10 + vmax.s32 q3, q3, q11 + subs r3, r3, #1 + bgt 1b + vsub.s32 q0, q0, q14 + vsub.s32 q1, q1, q14 + vsub.s32 q2, q2, q14 + vsub.s32 q3, q3, q14 + vclz.s32 q0, q0 + vclz.s32 q1, q1 + vclz.s32 q2, q2 + vclz.s32 q3, q3 + vsub.s32 q0, q13, q0 + vsub.s32 q1, q13, q1 + vsub.s32 q2, q13, q2 + vsub.s32 q3, q13, q3 +.endm + +/* + * constants: q14 = 1 + * input: q15 - joint stereo selection mask + * r5 - value set by calc_scalefactors macro + * r6 - value set by calc_scalefactors macro + */ +.macro update_joint_stereo_samples + sub r8, r6, r11 + sub r7, r5, r11 + sub r6, r6, r11, asl #1 + sub r5, r5, r11, asl #1 + vld1.32 {d18, d19}, [r6, :128] + vbic.s32 q11, q9, q14 + vld1.32 {d16, d17}, [r5, :128] + vld1.32 {d2, d3}, [r8, :128] + vbic.s32 q3, q1, q14 + vld1.32 {d0, d1}, [r7, :128] + vhsub.s32 q10, q8, q11 + vhadd.s32 q11, q8, q11 + vhsub.s32 q2, q0, q3 + vhadd.s32 q3, q0, q3 + vbif.s32 q10, q9, q15 + vbif.s32 d22, d16, d30 + sub r11, r10, r11, asl #1 + sub r3, r2, #2 +2: + vbif.s32 d23, d17, d31 + vst1.32 {d20, d21}, [r6, :128], r11 + vbif.s32 d4, d2, d30 + vld1.32 {d18, d19}, [r6, :128] + vbif.s32 d5, d3, d31 + vst1.32 {d22, d23}, [r5, :128], r11 + vbif.s32 d6, d0, d30 + vld1.32 {d16, d17}, [r5, :128] + vbif.s32 d7, d1, d31 + vst1.32 {d4, d5}, [r8, :128], r11 + vbic.s32 q11, q9, q14 + vld1.32 {d2, d3}, [r8, :128] + vst1.32 {d6, d7}, [r7, :128], r11 + vbic.s32 q3, q1, q14 + vld1.32 {d0, d1}, [r7, :128] + vhsub.s32 q10, q8, q11 + vhadd.s32 q11, q8, q11 + vhsub.s32 q2, q0, q3 + vhadd.s32 q3, q0, q3 + vbif.s32 q10, q9, q15 + vbif.s32 d22, d16, d30 + subs r3, r3, #2 + bgt 2b + sub r11, r10, r11, asr #1 + vbif.s32 d23, d17, 
d31 + vst1.32 {d20, d21}, [r6, :128] + vbif.s32 q2, q1, q15 + vst1.32 {d22, d23}, [r5, :128] + vbif.s32 q3, q0, q15 + vst1.32 {d4, d5}, [r8, :128] + vst1.32 {d6, d7}, [r7, :128] +.endm + +function ff_sbc_calc_scalefactors_j_neon, export=1 + @ parameters + @ r0 = in = sb_sample_f + @ r1 = out = scale_factor + @ r2 = blocks + @ r3 = subbands + @ local variables + @ r4 = consts = ff_sbcdsp_joint_bits_mask + @ r5 = in0 + @ r6 = in1 + @ r7 = out0 + @ r8 = out1 + @ r10 = zero + @ r11 = inc + @ return r0 = joint + + push {r3-r11} + movrelx r4, X(ff_sbcdsp_joint_bits_mask) + mov r10, #0 + mov r11, #64 + + vmov.s32 q14, #1 + vmov.s32 q13, #16 @ 31 - SCALE_OUT_BITS + + cmp r3, #4 + bne 8f + +4: @ 4 subbands + add r5, r0, #0 + add r6, r0, #32 + add r7, r1, #0 + add r8, r1, #32 + vmov.s32 q0, #0x8000 @ 1 << SCALE_OUT_BITS + vadd.s32 q0, q0, q14 + + calc_scalefactors + + @ check whether to use joint stereo for subbands 0, 1, 2 + vadd.s32 q15, q0, q1 + vadd.s32 q9, q2, q3 + vmov.s32 d31[1], r10 @ last subband -> no joint + vld1.32 {d16, d17}, [r4, :128]! + vcgt.s32 q15, q15, q9 + + @ calculate and save to memory 'joint' variable + @ update and save scale factors to memory + vand.s32 q8, q8, q15 + vbit.s32 q0, q2, q15 + vpadd.s32 d16, d16, d17 + vbit.s32 q1, q3, q15 + vpadd.s32 d16, d16, d16 + vst1.32 {d0, d1}, [r7, :128] + vst1.32 {d2, d3}, [r8, :128] + vmov.32 r0, d16[0] + + update_joint_stereo_samples + b 9f + +8: @ 8 subbands + add r5, r0, #16 + add r6, r0, #48 + add r7, r1, #16 + add r8, r1, #48 + vmov.s32 q0, #0x8000 @ 1 << SCALE_OUT_BITS + vadd.s32 q0, q0, q14 + + calc_scalefactors + + @ check whether to use joint stereo for subbands 4, 5, 6 + vadd.s32 q15, q0, q1 + vadd.s32 q9, q2, q3 + vmov.s32 d31[1], r10 @ last subband -> no joint + vld1.32 {d16, d17}, [r4, :128]! 
+ vcgt.s32 q15, q15, q9 + + @ calculate part of 'joint' variable and save it to d24 + @ update and save scale factors to memory + vand.s32 q8, q8, q15 + vbit.s32 q0, q2, q15 + vpadd.s32 d16, d16, d17 + vbit.s32 q1, q3, q15 + vst1.32 {d0, d1}, [r7, :128] + vst1.32 {d2, d3}, [r8, :128] + vpadd.s32 d24, d16, d16 + + update_joint_stereo_samples + + add r5, r0, #0 + add r6, r0, #32 + add r7, r1, #0 + add r8, r1, #32 + vmov.s32 q0, #0x8000 @ 1 << SCALE_OUT_BITS + vadd.s32 q0, q0, q14 + + calc_scalefactors + + @ check whether to use joint stereo for subbands 0, 1, 2, 3 + vadd.s32 q15, q0, q1 + vadd.s32 q9, q2, q3 + vld1.32 {d16, d17}, [r4, :128]! + vcgt.s32 q15, q15, q9 + + @ combine last part of 'joint' with d24 and save to memory + @ update and save scale factors to memory + vand.s32 q8, q8, q15 + vbit.s32 q0, q2, q15 + vpadd.s32 d16, d16, d17 + vbit.s32 q1, q3, q15 + vpadd.s32 d16, d16, d16 + vst1.32 {d0, d1}, [r7, :128] + vadd.s32 d16, d16, d24 + vst1.32 {d2, d3}, [r8, :128] + vmov.32 r0, d16[0] + + update_joint_stereo_samples +9: + pop {r3-r11} + bx lr +endfunc + +function ff_sbc_enc_process_input_4s_neon, export=1 + @ parameters + @ r0 = position + @ r1 = pcm + @ r2 = X + @ r3 = nsamples + @ r4 = nchannels + @ local variables + @ r5 = ff_sbc_input_perm_4 + @ r6 = src / x + @ r7 = dst / y + + push {r1, r3-r7} + ldr r4, [sp, #24] + movrelx r5, X(ff_sbc_input_perm_4) + + @ handle X buffer wraparound + cmp r0, r3 + bge 1f @ if (position < nsamples) + add r7, r2, #576 @ &X[0][SBC_X_BUFFER_SIZE - 40] + add r6, r2, r0, lsl#1 @ &X[0][position] + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0}, [r6, :64]! + vst1.16 {d0}, [r7, :64]! + cmp r4, #1 + ble 2f @ if (nchannels > 1) + add r7, r2, #1232 @ &X[1][SBC_X_BUFFER_SIZE - 40] + add r6, r2, #656 + add r6, r6, r0, lsl#1 @ &X[1][position] + vld1.16 {d0, d1, d2, d3}, [r6, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0}, [r6, :64]! + vst1.16 {d0}, [r7, :64]! +2: + mov r0, #288 @ SBC_X_BUFFER_SIZE - 40 +1: + + add r6, r2, r0, lsl#1 @ &X[0][position] + add r7, r6, #656 @ &X[1][position] + + cmp r4, #1 + ble 8f @ if (nchannels > 1) + tst r1, #1 + beq 7f @ if (pcm & 1) + @ poor 'pcm' alignment + vld1.8 {d0, d1}, [r5, :128] +1: + sub r6, r6, #16 + sub r7, r7, #16 + sub r0, r0, #8 + vld1.8 {d4, d5}, [r1]! + vuzp.16 d4, d5 + vld1.8 {d20, d21}, [r1]! + vuzp.16 d20, d21 + vswp d5, d20 + vtbl.8 d16, {d4, d5}, d0 + vtbl.8 d17, {d4, d5}, d1 + vtbl.8 d18, {d20, d21}, d0 + vtbl.8 d19, {d20, d21}, d1 + vst1.16 {d16, d17}, [r6, :128] + vst1.16 {d18, d19}, [r7, :128] + subs r3, r3, #8 + bgt 1b + b 9f +7: + @ proper 'pcm' alignment + vld1.8 {d0, d1}, [r5, :128] +1: + sub r6, r6, #16 + sub r7, r7, #16 + sub r0, r0, #8 + vld2.16 {d4, d5}, [r1]! + vld2.16 {d20, d21}, [r1]! + vswp d5, d20 + vtbl.8 d16, {d4, d5}, d0 + vtbl.8 d17, {d4, d5}, d1 + vtbl.8 d18, {d20, d21}, d0 + vtbl.8 d19, {d20, d21}, d1 + vst1.16 {d16, d17}, [r6, :128] + vst1.16 {d18, d19}, [r7, :128] + subs r3, r3, #8 + bgt 1b + b 9f +8: + @ mono + vld1.8 {d0, d1}, [r5, :128] +1: + sub r6, r6, #16 + sub r0, r0, #8 + vld1.8 {d4, d5}, [r1]! + vtbl.8 d16, {d4, d5}, d0 + vtbl.8 d17, {d4, d5}, d1 + vst1.16 {d16, d17}, [r6, :128] + subs r3, r3, #8 + bgt 1b +9: + pop {r1, r3-r7} + bx lr +endfunc + +function ff_sbc_enc_process_input_8s_neon, export=1 + @ parameters + @ r0 = position + @ r1 = pcm + @ r2 = X + @ r3 = nsamples + @ r4 = nchannels + @ local variables + @ r5 = ff_sbc_input_perm_8 + @ r6 = src + @ r7 = dst + + push {r1, r3-r7} + ldr r4, [sp, #24] + movrelx r5, X(ff_sbc_input_perm_8) + + @ handle X buffer wraparound + cmp r0, r3 + bge 1f @ if (position < nsamples) + add r7, r2, #512 @ &X[0][SBC_X_BUFFER_SIZE - 72] + add r6, r2, r0, lsl#1 @ &X[0][position] + vld1.16 {d0, d1, d2, d3}, [r6, :128]! 
+ vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1}, [r6, :128]! + vst1.16 {d0, d1}, [r7, :128]! + cmp r4, #1 + ble 2f @ if (nchannels > 1) + add r7, r2, #1168 @ &X[1][SBC_X_BUFFER_SIZE - 72] + add r6, r2, #656 + add r6, r6, r0, lsl#1 @ &X[1][position] + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1, d2, d3}, [r6, :128]! + vst1.16 {d0, d1, d2, d3}, [r7, :128]! + vld1.16 {d0, d1}, [r6, :128]! + vst1.16 {d0, d1}, [r7, :128]! +2: + mov r0, #256 @ SBC_X_BUFFER_SIZE - 72 +1: + + add r6, r2, r0, lsl#1 @ &X[0][position] + add r7, r6, #656 @ &X[1][position] + + cmp r4, #1 + ble 8f @ if (nchannels > 1) + tst r1, #1 + beq 7f @ if (pcm & 1) + @ poor 'pcm' alignment + vld1.8 {d0, d1, d2, d3}, [r5, :128] +1: + sub r6, r6, #32 + sub r7, r7, #32 + sub r0, r0, #16 + vld1.8 {d4, d5, d6, d7}, [r1]! + vuzp.16 q2, q3 + vld1.8 {d20, d21, d22, d23}, [r1]! + vuzp.16 q10, q11 + vswp q3, q10 + vtbl.8 d16, {d4, d5, d6, d7}, d0 + vtbl.8 d17, {d4, d5, d6, d7}, d1 + vtbl.8 d18, {d4, d5, d6, d7}, d2 + vtbl.8 d19, {d4, d5, d6, d7}, d3 + vst1.16 {d16, d17, d18, d19}, [r6, :128] + vtbl.8 d16, {d20, d21, d22, d23}, d0 + vtbl.8 d17, {d20, d21, d22, d23}, d1 + vtbl.8 d18, {d20, d21, d22, d23}, d2 + vtbl.8 d19, {d20, d21, d22, d23}, d3 + vst1.16 {d16, d17, d18, d19}, [r7, :128] + subs r3, r3, #16 + bgt 1b + b 9f +7: + @ proper 'pcm' alignment + vld1.8 {d0, d1, d2, d3}, [r5, :128] +1: + sub r6, r6, #32 + sub r7, r7, #32 + sub r0, r0, #16 + vld2.16 {d4, d5, d6, d7}, [r1]! + vld2.16 {d20, d21, d22, d23}, [r1]! 
+ vswp q3, q10 + vtbl.8 d16, {d4, d5, d6, d7}, d0 + vtbl.8 d17, {d4, d5, d6, d7}, d1 + vtbl.8 d18, {d4, d5, d6, d7}, d2 + vtbl.8 d19, {d4, d5, d6, d7}, d3 + vst1.16 {d16, d17, d18, d19}, [r6, :128] + vtbl.8 d16, {d20, d21, d22, d23}, d0 + vtbl.8 d17, {d20, d21, d22, d23}, d1 + vtbl.8 d18, {d20, d21, d22, d23}, d2 + vtbl.8 d19, {d20, d21, d22, d23}, d3 + vst1.16 {d16, d17, d18, d19}, [r7, :128] + subs r3, r3, #16 + bgt 1b + b 9f +8: + @ mono + vld1.8 {d0, d1, d2, d3}, [r5, :128] +1: + sub r6, r6, #32 + sub r0, r0, #16 + vld1.8 {d4, d5, d6, d7}, [r1]! + vtbl.8 d16, {d4, d5, d6, d7}, d0 + vtbl.8 d17, {d4, d5, d6, d7}, d1 + vtbl.8 d18, {d4, d5, d6, d7}, d2 + vtbl.8 d19, {d4, d5, d6, d7}, d3 + vst1.16 {d16, d17, d18, d19}, [r6, :128] + subs r3, r3, #16 + bgt 1b +9: + pop {r1, r3-r7} + bx lr +endfunc diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c index 2d0addcf28..e745595da0 100644 --- a/libavcodec/sbcdsp.c +++ b/libavcodec/sbcdsp.c @@ -380,6 +380,8 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) s->sbc_calc_scalefactors = sbc_calc_scalefactors; s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; + if (ARCH_ARM) + ff_sbcdsp_init_arm(s); if (ARCH_X86) ff_sbcdsp_init_x86(s); } diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h index 127e6a8a11..334c058e6d 100644 --- a/libavcodec/sbcdsp.h +++ b/libavcodec/sbcdsp.h @@ -80,6 +80,7 @@ struct sbc_dsp_context { */ void ff_sbcdsp_init(SBCDSPContext *s); +void ff_sbcdsp_init_arm(SBCDSPContext *s); void ff_sbcdsp_init_x86(SBCDSPContext *s); #endif /* AVCODEC_SBCDSP_H */