ffmpeg/libavcodec/arm/dcadsp_vfp.S

/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ Register aliases used by the inner_loop and dca_lfe_fir macros below.

@ Integer registers. a1-a3 hold the three incoming arguments.
POUT          .req    a1   @ output pointer, post-incremented by every store
PIN           .req    a2   @ input pointer, read at word offsets 0 down to -7
PCOEF         .req    a3   @ pointer to the current group of coefficients
OLDFPSCR      .req    a4   @ FPSCR value on entry, written back before return
COUNTER       .req    ip   @ loop counter

@ Input samples. These are loaded once per call. They live in s0-s7 and are
@ used as the scalar operand of the vector * scalar multiplies.
@ IN4-IN7 are only loaded and used when decifactor is 32.
IN0           .req    s4
IN1           .req    s5
IN2           .req    s6
IN3           .req    s7
IN4           .req    s0
IN5           .req    s1
IN6           .req    s2
IN7           .req    s3
@ COEF0-COEF3 and COEF4-COEF7 are used as two 4-element vectors, headed by
@ COEF0 and COEF4 respectively.
COEF0         .req    s8   @ coefficient elements
COEF1         .req    s9
COEF2         .req    s10
COEF3         .req    s11
COEF4         .req    s12
COEF5         .req    s13
COEF6         .req    s14
COEF7         .req    s15
@ Each ACCUMn names the first register of a 4-element vector (s16-s19 and
@ s20-s23). Even-numbered taps are summed in ACCUM0, odd-numbered in ACCUM4.
ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
ACCUM4        .req    s20
@ POST0-POST3 receive ACCUM0 + ACCUM4 and are then stored through POUT.
@ NOTE(review): the comment on POST0 used to mention a long-latency
@ post-multiply; no multiply is applied to POST0-POST3 in this file.
POST0         .req    s24  @ sum of both accumulator vectors, staged here until stored
POST1         .req    s25
POST2         .req    s26
POST3         .req    s27


/* inner_loop decifactor, dir, tail, head
 *
 * Emits one software-pipelined step of the LFE FIR filter. Each step deals
 * with one group of 4 consecutive output samples.
 *
 *   decifactor  32 or 64. 32 uses 8 taps per output sample (IN0-IN7),
 *               64 uses 4 taps per output sample (IN0-IN3).
 *   dir         "up" reads the coefficients forwards starting at PCOEF.
 *               Any other value reads them backwards, starting at the last
 *               word of the 4*JMAX-word group at PCOEF.
 *   tail        If non-empty, finish the previous group: add the two
 *               accumulator vectors into POST0-POST3 and store those four
 *               words through POUT, which is post-incremented.
 *   head        If non-empty, start the next group: load its coefficients
 *               and multiply-accumulate them with the input samples into
 *               the ACCUM0 and ACCUM4 vectors.
 *
 * The assembly-time symbol JMAX (taps per output sample) must already have
 * been .set by the caller. The coefficient for output k (0-3) of the group
 * and tap j is read from byte offset X + (k*JMAX + j) * Y relative to PCOEF.
 *
 * The "vector" comments below rely on the caller having switched the FPSCR
 * to short vectors of length 4, stride 1. The macro itself changes neither
 * PCOEF nor COUNTER.
 */
.macro inner_loop  decifactor, dir, tail, head
 @ X = byte offset of coefficient number 0, Y = signed byte step between
 @ consecutively numbered coefficients.
 .ifc "\dir","up"
  .set X, 0
  .set Y, 4
 .else
  .set X, 4*JMAX*4 - 4
  .set Y, -4
 .endif
 @ Head: tap 0 coefficients for the four outputs of the new group
 .ifnc "\head",""
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 @ Tail: sum both accumulators of the previous group. This has to be issued
 @ before the head multiplies below overwrite ACCUM0 and ACCUM4.
 .ifnc "\tail",""
        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
 .endif
 @ Head: tap 0 starts a fresh sum in ACCUM0, then load tap 1 coefficients
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
   @ Tap 1 starts a fresh sum in ACCUM4. Exactly one of the two copies of
   @ this multiply is assembled. With a tail present it is placed two loads
   @ later, presumably for pipeline scheduling.
   .ifc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector = vector * scalar
   .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
   .ifnc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1      @ vector = vector * scalar
   .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 @ Tail: write the four finished outputs and advance POUT by 4 words
 .ifnc "\tail",""
        vstmia  POUT!, {POST0-POST3}
 .endif
 @ Head: taps 2 and 3, which are the last taps when decifactor is 64
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
  @ Taps 4-7 exist only in the 8-tap (decifactor 32) variant
  .if \decifactor == 32
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
  .endif
 .endif
.endm

/* dca_lfe_fir decifactor
 *
 * Defines the exported function ff_dca_lfe_fir<decifactor>_vfp.
 *
 * On entry: a1 = POUT, a2 = PIN, a3 = PCOEF (see the aliases above).
 * NOTE(review): the C prototype is not visible in this file; presumably
 * (float *out, const float *in, const float *coefs) - confirm against the
 * C declaration.
 *
 * The function reads 8 (decifactor 32) or 4 (decifactor 64) input words
 * ending at PIN and writes 2 * decifactor words through POUT: decifactor
 * words from a forward pass over the coefficients, then decifactor words
 * from a backward pass. a1, a3, a4 and ip are modified. FPSCR and the VFP
 * registers pushed below are restored before returning.
 */
.macro dca_lfe_fir  decifactor
function ff_dca_lfe_fir\decifactor\()_vfp, export=1
        fmrx    OLDFPSCR, FPSCR
        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, ip
        @ Input samples are fetched once and reused for every output group
        vldr    IN0, [PIN, #-0*4]
        vldr    IN1, [PIN, #-1*4]
        vldr    IN2, [PIN, #-2*4]
        vldr    IN3, [PIN, #-3*4]
 .if \decifactor == 32
  .set JMAX, 8                          @ 8 taps per output sample
        vpush   {s16-s31}
        vldr    IN4, [PIN, #-4*4]
        vldr    IN5, [PIN, #-5*4]
        vldr    IN6, [PIN, #-6*4]
        vldr    IN7, [PIN, #-7*4]
 .else
  .set JMAX, 4                          @ 4 taps per output sample
        vpush   {s16-s27}
 .endif

        @ Forward pass over decifactor/4 groups of 4 outputs: one head-only
        @ step, then decifactor/4 - 1 combined tail+head steps, then one
        @ tail-only step. PCOEF moves on by one group (4*JMAX words) before
        @ each new head.
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, up,, head
1:      add     PCOEF, PCOEF, #4*JMAX*4
        @ The flags set by subs are tested by the bne after the macro body,
        @ which consists only of VFP instructions.
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, up, tail, head
        bne     1b
        inner_loop  \decifactor, up, tail

        @ Backward pass: same structure. It starts on the group PCOEF was
        @ left pointing at, reads each group in reverse and steps PCOEF back
        @ by one group per iteration.
        mov     COUNTER, #\decifactor/4 - 1
        inner_loop  \decifactor, down,, head
1:      sub     PCOEF, PCOEF, #4*JMAX*4
        subs    COUNTER, COUNTER, #1
        inner_loop  \decifactor, down, tail, head
        bne     1b
        inner_loop  \decifactor, down, tail

        @ Restore exactly the registers pushed above, then the caller FPSCR
 .if \decifactor == 32
        vpop    {s16-s31}
 .else
        vpop    {s16-s27}
 .endif
        fmxr    FPSCR, OLDFPSCR
        bx      lr
endfunc
.endm

        @ Instantiate ff_dca_lfe_fir64_vfp and ff_dca_lfe_fir32_vfp.
        dca_lfe_fir  64
 .ltorg                         @ emit the pending literal pool (the FPSCR constant) here
        dca_lfe_fir  32

        @ Release the aliases so that the names can be redefined below.
        .unreq  POUT
        .unreq  PIN
        .unreq  PCOEF
        .unreq  OLDFPSCR
        .unreq  COUNTER

        .unreq  IN0
        .unreq  IN1
        .unreq  IN2
        .unreq  IN3
        .unreq  IN4
        .unreq  IN5
        .unreq  IN6
        .unreq  IN7
        .unreq  COEF0
        .unreq  COEF1
        .unreq  COEF2
        .unreq  COEF3
        .unreq  COEF4
        .unreq  COEF5
        .unreq  COEF6
        .unreq  COEF7
        .unreq  ACCUM0
        .unreq  ACCUM4
        .unreq  POST0
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3


@ Register aliases for ff_dca_qmf_32_subbands_vfp.
IN      .req    a1      @ samples_in argument; advanced as rows are consumed
SBACT   .req    a2      @ sb_act argument
OLDFPSCR .req   a3      @ FPSCR on entry (overwrites the incoming synth argument)
IMDCT   .req    a4      @ imdct argument; the code reloads it from the saved a4 on the stack
WINDOW  .req    v1      @ window, loaded from the stack arguments
OUT     .req    v2      @ samples_out, loaded from the stack arguments
BUF     .req    v3      @ write/read cursor in the transposed stack buffer
SCALEINT .req   v4 @ only used in softfp case
COUNT   .req    v5      @ packed copy/fill counter, later the 8-pass call counter

SCALE   .req    s0      @ scale argument when passed in a VFP register

/* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */

/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 */
@ Overview: copies the first sb_act rows of the 32x8 input matrix into an
@ 8x32 buffer on the stack in transposed order, negating every input row
@ whose index is 0 or 3 modulo 4 and zero-filling the rest of each output
@ row. Then calls ff_synth_filter_float_vfp once for each of the 8 output
@ rows, with OUT advancing by 32 words per call.
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4            @ fp -> first stack argument (8 words were pushed)
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @   from the source matrix before we start zeroing the remainder
        @ The upper field is biased by -4, so COUNT stays non-negative for
        @ as long as at least 4 more input rows remain to be copied.
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f
1:
        @ Main loop: 4 input rows per iteration. Input element (row r,
        @ column c) is at IN + (r*8+c)*4 and goes to BUF + (c*32+r)*4.
        @ Row 0 is loaded into the even-numbered registers s8-s22 ...
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8                  @ vector: negates s8,s10,s12,s14 (row 0, columns 0-3)
        @ ... and row 1 into the odd-numbered registers s9-s23
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16                @ vector: negates s16,s18,s20,s22 (row 0, columns 4-7)
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        @ Each d register now holds one (row 0, row 1) pair of a column;
        @ store one pair into each of the 8 output rows
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        @ Same again for rows 2 (even registers) and 3 (odd registers);
        @ this time it is row 3, in the odd registers, that gets negated
        vldr    s9,  [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9                  @ vector: negates s9,s11,s13,s15 (row 3, columns 0-3)
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17                @ vector: negates s17,s19,s21,s23 (row 3, columns 4-7)
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4,  [BUF, #(0*32+2)*4]
        vstr    d5,  [BUF, #(1*32+2)*4]
        vstr    d6,  [BUF, #(2*32+2)*4]
        vstr    d7,  [BUF, #(3*32+2)*4]
        vstr    d8,  [BUF, #(4*32+2)*4]
        vstr    d9,  [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ next 4 input rows
        add     BUF, BUF, #4*4          @ next 4 words of each output row
        subs    COUNT, COUNT, #(4 << 5) + 2     @ 4 rows copied, 2 word pairs written
        bpl     1b
2:      @ Now deal with trailing < 4 samples
        @ Re-bias the upper field of COUNT from (remaining - 4) to
        @ (remaining - 1): negative means nothing is left, zero means one row
        adds    COUNT, COUNT, #3 << 5
        bmi     4f  @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F        @ lr is only a scratch destination; tests the upper field
        bne     3f
        @ sb_act was n*4+1
        @ Last row (negated) in the even registers, zeros in the odd ones
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1        @ one more word pair written
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1     @ 2 rows copied, 1 word pair written
        bics    lr, COUNT, #0x1F        @ upper field is zero only if a third row remains
        bne     4f
        @ sb_act was n*4+3
        @ Third row, copied without negation, paired with zeros
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ keep only the count of word pairs still to write
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        @ Transposition finished: put the FPSCR back before calling out
        fmxr    FPSCR, OLDFPSCR
        ldr     WINDOW, [fp, #3*4]      @ 4th stack argument word
        ldr     OUT, [fp, #4*4]         @ 5th stack argument word
        sub     BUF, BUF, #32*4         @ BUF advanced by 32 words in total; rewind to row 0
NOVFP   ldr     SCALEINT, [fp, #6*4]    @ 7th stack argument word (scale)
        mov     COUNT, #8               @ one call per output row
VFP     vpush   {SCALE}                 @ keep s0 so it can be reloaded before every call
VFP     sub     sp, sp, #3*4
NOVFP   sub     sp, sp, #4*4
7:
        @ Set up the arguments of ff_synth_filter_float_vfp: a1 = the a4
        @ saved on entry, a2-a4 = the first three stack argument words,
        @ followed on the stack by WINDOW, OUT, BUF (and SCALEINT if softfp)
VFP     ldr     a1, [fp, #-7*4]     @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}
VFP     stmia   sp, {WINDOW, OUT, BUF}
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ the copy of s0 pushed above
        bl      X(ff_synth_filter_float_vfp)
        add     OUT, OUT, #32*4         @ next 32 output words
        add     BUF, BUF, #32*4         @ next row of the transposed buffer
        subs    COUNT, COUNT, #1
        bne     7b

        @ Drop the buffer and callee arguments: sp is set to 8 + 8 words
        @ below fp, which is where it was right after vpush {s16-s23}
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc

        @ Release the register aliases of ff_dca_qmf_32_subbands_vfp.
        .unreq  IN
        .unreq  SBACT
        .unreq  OLDFPSCR
        .unreq  IMDCT
        .unreq  WINDOW
        .unreq  OUT
        .unreq  BUF
        .unreq  SCALEINT
        .unreq  COUNT

        .unreq  SCALE

        @ Word-aligned 32-bit zero constant, loaded by the "vldr sN, zero"
        @ instructions above to obtain a single-precision 0.0.
        .align 2
zero:   .word   0
arm: Add VFP-accelerated version of dca_lfe_fir Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-19 08:03:32 +00:00			`/*`
			`* Copyright (c) 2013 RISC OS Open Ltd`
			`* Author: Ben Avison <bavison@riscosopen.org>`
			`*`
			`* This file is part of Libav.`
			`*`
			`* Libav is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* Libav is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with Libav; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "libavutil/arm/asm.S"`

			`POUT .req a1`
			`PIN .req a2`
			`PCOEF .req a3`
			`OLDFPSCR .req a4`
			`COUNTER .req ip`

			`IN0 .req s4`
			`IN1 .req s5`
			`IN2 .req s6`
			`IN3 .req s7`
			`IN4 .req s0`
			`IN5 .req s1`
			`IN6 .req s2`
			`IN7 .req s3`
			`COEF0 .req s8 @ coefficient elements`
			`COEF1 .req s9`
			`COEF2 .req s10`
			`COEF3 .req s11`
			`COEF4 .req s12`
			`COEF5 .req s13`
			`COEF6 .req s14`
			`COEF7 .req s15`
			`ACCUM0 .req s16 @ double-buffered multiply-accumulate results`
			`ACCUM4 .req s20`
			`POST0 .req s24 @ do long-latency post-multiply in this vector in parallel`
			`POST1 .req s25`
			`POST2 .req s26`
			`POST3 .req s27`


			`.macro inner_loop decifactor, dir, tail, head`
			`.ifc "\dir","up"`
			`.set X, 0`
			`.set Y, 4`
			`.else`
			`.set X, 4*JMAX*4 - 4`
			`.set Y, -4`
			`.endif`
			`.ifnc "\head",""`
			`vldr COEF0, [PCOEF, #X + (0JMAX + 0) Y]`
			`vldr COEF1, [PCOEF, #X + (1JMAX + 0) Y]`
			`vldr COEF2, [PCOEF, #X + (2JMAX + 0) Y]`
			`vldr COEF3, [PCOEF, #X + (3JMAX + 0) Y]`
			`.endif`
			`.ifnc "\tail",""`
			`vadd.f POST0, ACCUM0, ACCUM4 @ vector operation`
			`.endif`
			`.ifnc "\head",""`
			`vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar`
			`vldr COEF4, [PCOEF, #X + (0JMAX + 1) Y]`
			`vldr COEF5, [PCOEF, #X + (1JMAX + 1) Y]`
			`vldr COEF6, [PCOEF, #X + (2JMAX + 1) Y]`
			`.endif`
			`.ifnc "\head",""`
			`vldr COEF7, [PCOEF, #X + (3JMAX + 1) Y]`
			`.ifc "\tail",""`
			`vmul.f ACCUM4, COEF4, IN1 @ vector operation`
			`.endif`
			`vldr COEF0, [PCOEF, #X + (0JMAX + 2) Y]`
			`vldr COEF1, [PCOEF, #X + (1JMAX + 2) Y]`
			`.ifnc "\tail",""`
			`vmul.f ACCUM4, COEF4, IN1 @ vector operation`
			`.endif`
			`vldr COEF2, [PCOEF, #X + (2JMAX + 2) Y]`
			`vldr COEF3, [PCOEF, #X + (3JMAX + 2) Y]`
			`.endif`
			`.ifnc "\tail",""`
			`vstmia POUT!, {POST0-POST3}`
			`.endif`
			`.ifnc "\head",""`
			`vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar`
			`vldr COEF4, [PCOEF, #X + (0JMAX + 3) Y]`
			`vldr COEF5, [PCOEF, #X + (1JMAX + 3) Y]`
			`vldr COEF6, [PCOEF, #X + (2JMAX + 3) Y]`
			`vldr COEF7, [PCOEF, #X + (3JMAX + 3) Y]`
			`vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar`
			`.if \decifactor == 32`
			`vldr COEF0, [PCOEF, #X + (0JMAX + 4) Y]`
			`vldr COEF1, [PCOEF, #X + (1JMAX + 4) Y]`
			`vldr COEF2, [PCOEF, #X + (2JMAX + 4) Y]`
			`vldr COEF3, [PCOEF, #X + (3JMAX + 4) Y]`
			`vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar`
			`vldr COEF4, [PCOEF, #X + (0JMAX + 5) Y]`
			`vldr COEF5, [PCOEF, #X + (1JMAX + 5) Y]`
			`vldr COEF6, [PCOEF, #X + (2JMAX + 5) Y]`
			`vldr COEF7, [PCOEF, #X + (3JMAX + 5) Y]`
			`vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar`
			`vldr COEF0, [PCOEF, #X + (0JMAX + 6) Y]`
			`vldr COEF1, [PCOEF, #X + (1JMAX + 6) Y]`
			`vldr COEF2, [PCOEF, #X + (2JMAX + 6) Y]`
			`vldr COEF3, [PCOEF, #X + (3JMAX + 6) Y]`
			`vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar`
			`vldr COEF4, [PCOEF, #X + (0JMAX + 7) Y]`
			`vldr COEF5, [PCOEF, #X + (1JMAX + 7) Y]`
			`vldr COEF6, [PCOEF, #X + (2JMAX + 7) Y]`
			`vldr COEF7, [PCOEF, #X + (3JMAX + 7) Y]`
			`vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar`
			`.endif`
			`.endif`
			`.endm`

			`.macro dca_lfe_fir decifactor`
dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-05 23:40:52 +00:00			`function ff_dca_lfe_fir\decifactor\()_vfp, export=1`
			`fmrx OLDFPSCR, FPSCR`
			`ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1`
			`fmxr FPSCR, ip`
			`vldr IN0, [PIN, #-0*4]`
			`vldr IN1, [PIN, #-1*4]`
			`vldr IN2, [PIN, #-2*4]`
			`vldr IN3, [PIN, #-3*4]`
arm: Add VFP-accelerated version of dca_lfe_fir Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-19 08:03:32 +00:00			`.if \decifactor == 32`
			`.set JMAX, 8`
			`vpush {s16-s31}`
			`vldr IN4, [PIN, #-4*4]`
			`vldr IN5, [PIN, #-5*4]`
			`vldr IN6, [PIN, #-6*4]`
			`vldr IN7, [PIN, #-7*4]`
			`.else`
			`.set JMAX, 4`
			`vpush {s16-s27}`
			`.endif`

			`mov COUNTER, #\decifactor/4 - 1`
			`inner_loop \decifactor, up,, head`
			`1: add PCOEF, PCOEF, #4*JMAX*4`
			`subs COUNTER, COUNTER, #1`
			`inner_loop \decifactor, up, tail, head`
			`bne 1b`
			`inner_loop \decifactor, up, tail`

			`mov COUNTER, #\decifactor/4 - 1`
			`inner_loop \decifactor, down,, head`
			`1: sub PCOEF, PCOEF, #4*JMAX*4`
			`subs COUNTER, COUNTER, #1`
			`inner_loop \decifactor, down, tail, head`
			`bne 1b`
			`inner_loop \decifactor, down, tail`

			`.if \decifactor == 32`
			`vpop {s16-s31}`
			`.else`
			`vpop {s16-s27}`
			`.endif`
			`fmxr FPSCR, OLDFPSCR`
			`bx lr`
dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-05 23:40:52 +00:00			`endfunc`
arm: Add VFP-accelerated version of dca_lfe_fir Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-19 08:03:32 +00:00			`.endm`

dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-05 23:40:52 +00:00			`dca_lfe_fir 64`
arm: Add VFP-accelerated version of dca_lfe_fir Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-19 08:03:32 +00:00			`.ltorg`
dcadsp: split lfe_dir cases The x86 runs short on registers because numerous elements are not static. In addition, splitting them allows more optimized code, at least for x86. Arm asm changes by Janne Grunau. Signed-off-by: Janne Grunau <janne-libav@jannau.net> 2014-02-05 23:40:52 +00:00			`dca_lfe_fir 32`
arm: Add VFP-accelerated version of dca_lfe_fir Before After Mean StdDev Mean StdDev Change This function 868.2 33.5 436.0 27.0 +99.1% Overall 15973.0 223.2 15577.5 83.2 +2.5% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-19 08:03:32 +00:00
			`.unreq POUT`
			`.unreq PIN`
			`.unreq PCOEF`
			`.unreq OLDFPSCR`
			`.unreq COUNTER`

			`.unreq IN0`
			`.unreq IN1`
			`.unreq IN2`
			`.unreq IN3`
			`.unreq IN4`
			`.unreq IN5`
			`.unreq IN6`
			`.unreq IN7`
			`.unreq COEF0`
			`.unreq COEF1`
			`.unreq COEF2`
			`.unreq COEF3`
			`.unreq COEF4`
			`.unreq COEF5`
			`.unreq COEF6`
			`.unreq COEF7`
			`.unreq ACCUM0`
			`.unreq ACCUM4`
			`.unreq POST0`
			`.unreq POST1`
			`.unreq POST2`
			`.unreq POST3`
arm: Add VFP-accelerated version of qmf_32_subbands Before After Mean StdDev Mean StdDev Change This function 1323.0 98.0 746.2 60.6 +77.3% Overall 15400.0 336.4 14147.5 288.4 +8.9% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-15 17:28:17 +00:00

			`IN .req a1`
			`SBACT .req a2`
			`OLDFPSCR .req a3`
			`IMDCT .req a4`
			`WINDOW .req v1`
			`OUT .req v2`
			`BUF .req v3`
			`SCALEINT .req v4 @ only used in softfp case`
			`COUNT .req v5`

			`SCALE .req s0`

			`/* Stack layout differs in softfp and hardfp cases:`
			`*`
			`* hardfp`
			`* fp -> 6 arg words saved by caller`
			`* a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)`
			`* s16-s23 on entry`
			`* align 16`
			`* buf -> 8*32*4 bytes buffer`
			`* s0 on entry`
			`* sp -> 3 arg words for callee`
			`*`
			`* softfp`
			`* fp -> 7 arg words saved by caller`
			`* a4,v1-v5,fp,lr on entry`
			`* s16-s23 on entry`
			`* align 16`
			`* buf -> 8*32*4 bytes buffer`
			`* sp -> 4 arg words for callee`
			`*/`

			`/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,`
			`* SynthFilterContext *synth, FFTContext *imdct,`
			`* float (*synth_buf_ptr)[512],`
			`* int *synth_buf_offset, float (*synth_buf2)[32],`
			`* const float (*window)[512], float *samples_out,`
			`* float (*raXin)[32], float scale);`
			`*/`
			`function ff_dca_qmf_32_subbands_vfp, export=1`
			`VFP push {a3-a4,v1-v3,v5,fp,lr}`
			`NOVFP push {a4,v1-v5,fp,lr}`
			`add fp, sp, #8*4`
			`vpush {s16-s23}`
			`@ The buffer pointed at by raXin isn't big enough for us to do a`
			`@ complete matrix transposition as we want to, so allocate an`
			`@ alternative buffer from the stack. Align to 4 words for speed.`
			`sub BUF, sp, #8*32*4`
			`bic BUF, BUF, #15`
			`mov sp, BUF`
			`ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2`
			`fmrx OLDFPSCR, FPSCR`
			`fmxr FPSCR, lr`
			`@ COUNT is used to count down 2 things at once:`
			`@ bits 0-4 are the number of word pairs remaining in the output row`
			`@ bits 5-31 are the number of words to copy (with possible negation)`
			`@ from the source matrix before we start zeroing the remainder`
			`mov COUNT, #(-4 << 5) + 16`
			`adds COUNT, COUNT, SBACT, lsl #5`
			`bmi 2f`
			`1:`
			`vldr s8, [IN, #(08+0)4]`
			`vldr s10, [IN, #(08+1)4]`
			`vldr s12, [IN, #(08+2)4]`
			`vldr s14, [IN, #(08+3)4]`
			`vldr s16, [IN, #(08+4)4]`
			`vldr s18, [IN, #(08+5)4]`
			`vldr s20, [IN, #(08+6)4]`
			`vldr s22, [IN, #(08+7)4]`
			`vneg.f s8, s8`
			`vldr s9, [IN, #(18+0)4]`
			`vldr s11, [IN, #(18+1)4]`
			`vldr s13, [IN, #(18+2)4]`
			`vldr s15, [IN, #(18+3)4]`
			`vneg.f s16, s16`
			`vldr s17, [IN, #(18+4)4]`
			`vldr s19, [IN, #(18+5)4]`
			`vldr s21, [IN, #(18+6)4]`
			`vldr s23, [IN, #(18+7)4]`
			`vstr d4, [BUF, #(032+0)4]`
			`vstr d5, [BUF, #(132+0)4]`
			`vstr d6, [BUF, #(232+0)4]`
			`vstr d7, [BUF, #(332+0)4]`
			`vstr d8, [BUF, #(432+0)4]`
			`vstr d9, [BUF, #(532+0)4]`
			`vstr d10, [BUF, #(632+0)4]`
			`vstr d11, [BUF, #(732+0)4]`
			`vldr s9, [IN, #(38+0)4]`
			`vldr s11, [IN, #(38+1)4]`
			`vldr s13, [IN, #(38+2)4]`
			`vldr s15, [IN, #(38+3)4]`
			`vldr s17, [IN, #(38+4)4]`
			`vldr s19, [IN, #(38+5)4]`
			`vldr s21, [IN, #(38+6)4]`
			`vldr s23, [IN, #(38+7)4]`
			`vneg.f s9, s9`
			`vldr s8, [IN, #(28+0)4]`
			`vldr s10, [IN, #(28+1)4]`
			`vldr s12, [IN, #(28+2)4]`
			`vldr s14, [IN, #(28+3)4]`
			`vneg.f s17, s17`
			`vldr s16, [IN, #(28+4)4]`
			`vldr s18, [IN, #(28+5)4]`
			`vldr s20, [IN, #(28+6)4]`
			`vldr s22, [IN, #(28+7)4]`
			`vstr d4, [BUF, #(032+2)4]`
			`vstr d5, [BUF, #(132+2)4]`
			`vstr d6, [BUF, #(232+2)4]`
			`vstr d7, [BUF, #(332+2)4]`
			`vstr d8, [BUF, #(432+2)4]`
			`vstr d9, [BUF, #(532+2)4]`
			`vstr d10, [BUF, #(632+2)4]`
			`vstr d11, [BUF, #(732+2)4]`
			`add IN, IN, #4*8*4`
			`add BUF, BUF, #4*4`
			`subs COUNT, COUNT, #(4 << 5) + 2`
			`bpl 1b`
			`2: @ Now deal with trailing < 4 samples`
			`adds COUNT, COUNT, #3 << 5`
			`bmi 4f @ sb_act was a multiple of 4`
			`bics lr, COUNT, #0x1F`
			`bne 3f`
			`@ sb_act was n*4+1`
			`vldr s8, [IN, #(08+0)4]`
			`vldr s10, [IN, #(08+1)4]`
			`vldr s12, [IN, #(08+2)4]`
			`vldr s14, [IN, #(08+3)4]`
			`vldr s16, [IN, #(08+4)4]`
			`vldr s18, [IN, #(08+5)4]`
			`vldr s20, [IN, #(08+6)4]`
			`vldr s22, [IN, #(08+7)4]`
			`vneg.f s8, s8`
			`vldr s9, zero`
			`vldr s11, zero`
			`vldr s13, zero`
			`vldr s15, zero`
			`vneg.f s16, s16`
			`vldr s17, zero`
			`vldr s19, zero`
			`vldr s21, zero`
			`vldr s23, zero`
			`vstr d4, [BUF, #(032+0)4]`
			`vstr d5, [BUF, #(132+0)4]`
			`vstr d6, [BUF, #(232+0)4]`
			`vstr d7, [BUF, #(332+0)4]`
			`vstr d8, [BUF, #(432+0)4]`
			`vstr d9, [BUF, #(532+0)4]`
			`vstr d10, [BUF, #(632+0)4]`
			`vstr d11, [BUF, #(732+0)4]`
			`add BUF, BUF, #2*4`
			`sub COUNT, COUNT, #1`
			`b 4f`
			`3: @ sb_act was n*4+2 or n*4+3, so do the first 2`
			`vldr s8, [IN, #(08+0)4]`
			`vldr s10, [IN, #(08+1)4]`
			`vldr s12, [IN, #(08+2)4]`
			`vldr s14, [IN, #(08+3)4]`
			`vldr s16, [IN, #(08+4)4]`
			`vldr s18, [IN, #(08+5)4]`
			`vldr s20, [IN, #(08+6)4]`
			`vldr s22, [IN, #(08+7)4]`
			`vneg.f s8, s8`
			`vldr s9, [IN, #(18+0)4]`
			`vldr s11, [IN, #(18+1)4]`
			`vldr s13, [IN, #(18+2)4]`
			`vldr s15, [IN, #(18+3)4]`
			`vneg.f s16, s16`
			`vldr s17, [IN, #(18+4)4]`
			`vldr s19, [IN, #(18+5)4]`
			`vldr s21, [IN, #(18+6)4]`
			`vldr s23, [IN, #(18+7)4]`
			`vstr d4, [BUF, #(032+0)4]`
			`vstr d5, [BUF, #(132+0)4]`
			`vstr d6, [BUF, #(232+0)4]`
			`vstr d7, [BUF, #(332+0)4]`
			`vstr d8, [BUF, #(432+0)4]`
			`vstr d9, [BUF, #(532+0)4]`
			`vstr d10, [BUF, #(632+0)4]`
			`vstr d11, [BUF, #(732+0)4]`
			`add BUF, BUF, #2*4`
			`sub COUNT, COUNT, #(2 << 5) + 1`
			`bics lr, COUNT, #0x1F`
			`bne 4f`
			`@ sb_act was n*4+3`
			`vldr s8, [IN, #(28+0)4]`
			`vldr s10, [IN, #(28+1)4]`
			`vldr s12, [IN, #(28+2)4]`
			`vldr s14, [IN, #(28+3)4]`
			`vldr s16, [IN, #(28+4)4]`
			`vldr s18, [IN, #(28+5)4]`
			`vldr s20, [IN, #(28+6)4]`
			`vldr s22, [IN, #(28+7)4]`
			`vldr s9, zero`
			`vldr s11, zero`
			`vldr s13, zero`
			`vldr s15, zero`
			`vldr s17, zero`
			`vldr s19, zero`
			`vldr s21, zero`
			`vldr s23, zero`
			`vstr d4, [BUF, #(032+0)4]`
			`vstr d5, [BUF, #(132+0)4]`
			`vstr d6, [BUF, #(232+0)4]`
			`vstr d7, [BUF, #(332+0)4]`
			`vstr d8, [BUF, #(432+0)4]`
			`vstr d9, [BUF, #(532+0)4]`
			`vstr d10, [BUF, #(632+0)4]`
			`vstr d11, [BUF, #(732+0)4]`
			`add BUF, BUF, #2*4`
			`sub COUNT, COUNT, #1`
			`4: @ Now fill the remainder with 0`
			`vldr s8, zero`
			`vldr s9, zero`
			`ands COUNT, COUNT, #0x1F`
			`beq 6f`
			`5: vstr d4, [BUF, #(032+0)4]`
			`vstr d4, [BUF, #(132+0)4]`
			`vstr d4, [BUF, #(232+0)4]`
			`vstr d4, [BUF, #(332+0)4]`
			`vstr d4, [BUF, #(432+0)4]`
			`vstr d4, [BUF, #(532+0)4]`
			`vstr d4, [BUF, #(632+0)4]`
			`vstr d4, [BUF, #(732+0)4]`
			`add BUF, BUF, #2*4`
			`subs COUNT, COUNT, #1`
			`bne 5b`
			`6:`
			`fmxr FPSCR, OLDFPSCR`
			`ldr WINDOW, [fp, #3*4]`
			`ldr OUT, [fp, #4*4]`
			`sub BUF, BUF, #32*4`
			`NOVFP ldr SCALEINT, [fp, #6*4]`
			`mov COUNT, #8`
			`VFP vpush {SCALE}`
			`VFP sub sp, sp, #3*4`
			`NOVFP sub sp, sp, #4*4`
			`7:`
			`VFP ldr a1, [fp, #-7*4] @ imdct`
			`NOVFP ldr a1, [fp, #-8*4]`
			`ldmia fp, {a2-a4}`
			`VFP stmia sp, {WINDOW, OUT, BUF}`
			`NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}`
			`VFP vldr SCALE, [sp, #3*4]`
arm: Mangle external symbols properly in new vfp assembly files Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-22 09:24:43 +00:00			`bl X(ff_synth_filter_float_vfp)`
arm: Add VFP-accelerated version of qmf_32_subbands Before After Mean StdDev Mean StdDev Change This function 1323.0 98.0 746.2 60.6 +77.3% Overall 15400.0 336.4 14147.5 288.4 +8.9% Signed-off-by: Martin Storsjö <martin@martin.st> 2013-07-15 17:28:17 +00:00			`add OUT, OUT, #32*4`
			`add BUF, BUF, #32*4`
			`subs COUNT, COUNT, #1`
			`bne 7b`

			`A sub sp, fp, #(8+8)*4`
			`T sub fp, fp, #(8+8)*4`
			`T mov sp, fp`
			`vpop {s16-s23}`
			`VFP pop {a3-a4,v1-v3,v5,fp,pc}`
			`NOVFP pop {a4,v1-v5,fp,pc}`
			`endfunc`

			`.unreq IN`
			`.unreq SBACT`
			`.unreq OLDFPSCR`
			`.unreq IMDCT`
			`.unreq WINDOW`
			`.unreq OUT`
			`.unreq BUF`
			`.unreq SCALEINT`
			`.unreq COUNT`

			`.unreq SCALE`

			`.align 2`
			`zero: .word 0`