lavu/float_dsp: avoid reg-stride in R-V V reverse_fmul

This rewrites the inner loop to reverse the elements within each vector
after a unit-stride load, thus eliminating the negative-stride load (whose
stride was held in a register). Note that RVV does not have a vector
reverse instruction, so this uses a gather (vrgatherei16.vv) instead.
This commit is contained in:
Rémi Denis-Courmont 2023-09-30 13:08:49 +03:00
parent d14130aea3
commit 446b0090cb
1 changed files with 13 additions and 8 deletions

View File

@ -125,20 +125,25 @@ func ff_vector_fmul_add_rvv, zve32f
ret ret
endfunc endfunc
// TODO factor vrsub, separate last iteration?
// (a0) = (a1) * reverse(a2) [0..a3-1] // (a0) = (a1) * reverse(a2) [0..a3-1]
func ff_vector_fmul_reverse_rvv, zve32f func ff_vector_fmul_reverse_rvv, zve32f
vsetvli t0, zero, e16, m4, ta, ma
sh2add a2, a3, a2 sh2add a2, a3, a2
li t2, -4 // byte stride vid.v v0
addi a2, a2, -4 vadd.vi v0, v0, 1
1: 1:
vsetvli t0, a3, e32, m8, ta, ma vsetvli t0, a3, e16, m4, ta, ma
slli t1, t0, 2 slli t1, t0, 2
vle32.v v16, (a1) vrsub.vx v4, v0, t0 // v4[i] = [VL-1, VL-2... 1, 0]
sub a3, a3, t0
vlse32.v v24, (a2), t2
add a1, a1, t1
vfmul.vv v16, v16, v24
sub a2, a2, t1 sub a2, a2, t1
vsetvli zero, zero, e32, m8, ta, ma
vle32.v v8, (a2)
sub a3, a3, t0
vle32.v v16, (a1)
add a1, a1, t1
vrgatherei16.vv v24, v8, v4 // v24 = reverse(v8)
vfmul.vv v16, v16, v24
vse32.v v16, (a0) vse32.v v16, (a0)
add a0, a0, t1 add a0, a0, t1
bnez a3, 1b bnez a3, 1b