lavc/opusdsp: simplify R-V V postfilter

This skips the round-trip through scalar registers for the sliding 'x'
coefficients, improving performance by about 5%. The trick here is that
the vector slide-up instruction leaves the elements of the destination
vector below the slide offset unchanged.

The switch from vfslide1up.vf to vslideup.vi also makes it possible to
eliminate data dependencies between consecutive slides. Since the
specifications recommend sticking to power-of-two offsets, we could slide
as follows (two slides read v0 directly and two read v8, instead of a
chain of four dependent slides):

        vslideup.vi v8, v0, 2
        vslideup.vi v4, v0, 1
        vslideup.vi v12, v8, 1
        vslideup.vi v16, v8, 2

However, on the device under test, this seems to make performance slightly
worse, so this is left for (in)validation on future, better hardware.
This commit is contained in:
Rémi Denis-Courmont 2023-12-16 10:02:08 +02:00
parent 04cb307508
commit db32f75c63

View File

@@ -26,40 +26,34 @@ func ff_opus_postfilter_rvv, zve32f
flw fa1, 4(a2) // g1
sub t0, a0, t1
flw fa2, 8(a2) // g2
addi t1, t0, -2 * 4 // data - (period + 2) = initial &x4
vsetivli zero, 4, e32, m4, ta, ma
addi t0, t0, 2 * 4 // data - (period - 2) = initial &x0
flw ft4, -16(t0)
vle32.v v16, (t1)
addi t3, a1, -2 // maximum parallelism w/o stepping our tail
flw ft3, -12(t0)
flw ft2, -8(t0)
flw ft1, -4(t0)
1:
vslidedown.vi v8, v16, 2
min t1, a3, t3
vslide1down.vx v12, v16, zero
vsetvli t1, t1, e32, m4, ta, ma
vle32.v v0, (t0) // x0
sub a3, a3, t1
vle32.v v28, (a0)
vslide1down.vx v4, v8, zero
sh2add t0, t1, t0
vfslide1up.vf v4, v0, ft1
vle32.v v28, (a0)
addi t2, t1, -4
vfslide1up.vf v8, v4, ft2
vfslide1up.vf v12, v8, ft3
vfslide1up.vf v16, v12, ft4
vslideup.vi v4, v0, 1
vslideup.vi v8, v4, 1
vslideup.vi v12, v8, 1
vslideup.vi v16, v12, 1
vfadd.vv v20, v4, v12
vfadd.vv v24, v0, v16
vslidedown.vx v12, v0, t2
vslidedown.vx v16, v0, t2
vfmacc.vf v28, fa0, v8
vslidedown.vi v4, v12, 2
vfmacc.vf v28, fa1, v20
vslide1down.vx v8, v12, zero
vfmacc.vf v28, fa2, v24
vslide1down.vx v0, v4, zero
vse32.v v28, (a0)
vfmv.f.s ft4, v12
sh2add a0, t1, a0
vfmv.f.s ft2, v4
vfmv.f.s ft3, v8
vfmv.f.s ft1, v0
bnez a3, 1b
ret