mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-12-24 00:02:52 +00:00
lavc/opusdsp: simplify R-V V postfilter
This skips the round-trip to scalar registers for the sliding 'x' coefficients, improving performance by about 5%. The trick here is that the vector slide-up instruction preserves the elements in the destination vector below the slide offset.

The switch from vfslide1up.vf to vslideup.vi also allows the elimination of data dependencies on consecutive slides. Since the specifications recommend sticking to power-of-two offsets, we could slide as follows:

    vslideup.vi v8, v0, 2
    vslideup.vi v4, v0, 1
    vslideup.vi v12, v8, 1
    vslideup.vi v16, v8, 2

However, on the device under test, this seems to make performance slightly worse, so this is left for (in)validation with future better hardware.
This commit is contained in:
parent
04cb307508
commit
db32f75c63
@ -26,40 +26,34 @@ func ff_opus_postfilter_rvv, zve32f
|
|||||||
flw fa1, 4(a2) // g1
|
flw fa1, 4(a2) // g1
|
||||||
sub t0, a0, t1
|
sub t0, a0, t1
|
||||||
flw fa2, 8(a2) // g2
|
flw fa2, 8(a2) // g2
|
||||||
|
addi t1, t0, -2 * 4 // data - (period + 2) = initial &x4
|
||||||
|
vsetivli zero, 4, e32, m4, ta, ma
|
||||||
addi t0, t0, 2 * 4 // data - (period - 2) = initial &x0
|
addi t0, t0, 2 * 4 // data - (period - 2) = initial &x0
|
||||||
|
vle32.v v16, (t1)
|
||||||
flw ft4, -16(t0)
|
|
||||||
addi t3, a1, -2 // maximum parallelism w/o stepping our tail
|
addi t3, a1, -2 // maximum parallelism w/o stepping our tail
|
||||||
flw ft3, -12(t0)
|
|
||||||
flw ft2, -8(t0)
|
|
||||||
flw ft1, -4(t0)
|
|
||||||
1:
|
1:
|
||||||
|
vslidedown.vi v8, v16, 2
|
||||||
min t1, a3, t3
|
min t1, a3, t3
|
||||||
|
vslide1down.vx v12, v16, zero
|
||||||
vsetvli t1, t1, e32, m4, ta, ma
|
vsetvli t1, t1, e32, m4, ta, ma
|
||||||
vle32.v v0, (t0) // x0
|
vle32.v v0, (t0) // x0
|
||||||
sub a3, a3, t1
|
sub a3, a3, t1
|
||||||
vle32.v v28, (a0)
|
vslide1down.vx v4, v8, zero
|
||||||
sh2add t0, t1, t0
|
sh2add t0, t1, t0
|
||||||
vfslide1up.vf v4, v0, ft1
|
vle32.v v28, (a0)
|
||||||
addi t2, t1, -4
|
addi t2, t1, -4
|
||||||
vfslide1up.vf v8, v4, ft2
|
vslideup.vi v4, v0, 1
|
||||||
vfslide1up.vf v12, v8, ft3
|
vslideup.vi v8, v4, 1
|
||||||
vfslide1up.vf v16, v12, ft4
|
vslideup.vi v12, v8, 1
|
||||||
|
vslideup.vi v16, v12, 1
|
||||||
vfadd.vv v20, v4, v12
|
vfadd.vv v20, v4, v12
|
||||||
vfadd.vv v24, v0, v16
|
vfadd.vv v24, v0, v16
|
||||||
vslidedown.vx v12, v0, t2
|
vslidedown.vx v16, v0, t2
|
||||||
vfmacc.vf v28, fa0, v8
|
vfmacc.vf v28, fa0, v8
|
||||||
vslidedown.vi v4, v12, 2
|
|
||||||
vfmacc.vf v28, fa1, v20
|
vfmacc.vf v28, fa1, v20
|
||||||
vslide1down.vx v8, v12, zero
|
|
||||||
vfmacc.vf v28, fa2, v24
|
vfmacc.vf v28, fa2, v24
|
||||||
vslide1down.vx v0, v4, zero
|
|
||||||
vse32.v v28, (a0)
|
vse32.v v28, (a0)
|
||||||
vfmv.f.s ft4, v12
|
|
||||||
sh2add a0, t1, a0
|
sh2add a0, t1, a0
|
||||||
vfmv.f.s ft2, v4
|
|
||||||
vfmv.f.s ft3, v8
|
|
||||||
vfmv.f.s ft1, v0
|
|
||||||
bnez a3, 1b
|
bnez a3, 1b
|
||||||
|
|
||||||
ret
|
ret
|
||||||
|
Loading…
Reference in New Issue
Block a user