lavu/float_dsp: rework RISC-V V scalar product

1) Take the reduction sum out of the loop,
   leaving a regular vector addition in the loop.
2) Merge the addition and the multiplication (vfmacc.vv).
3) Unroll (use LMUL=8 register groups instead of LMUL=1).

Before:
scalarproduct_float_rvv_f32: 832.5

After:
scalarproduct_float_rvv_f32: 275.2
This commit is contained in:
Rémi Denis-Courmont 2023-07-17 20:11:08 +03:00
parent b710f881ce
commit 29b9d616c2
1 changed file with 8 additions and 6 deletions

View File

@@ -166,20 +166,22 @@ endfunc
// a0 = (a0).(a1) [0..a2-1] // a0 = (a0).(a1) [0..a2-1]
func ff_scalarproduct_float_rvv, zve32f func ff_scalarproduct_float_rvv, zve32f
vsetivli zero, 1, e32, m1, ta, ma vsetvli t0, zero, e32, m8, ta, ma
vmv.s.x v8, zero vmv.v.x v8, zero
vmv.s.x v0, zero
1: 1:
vsetvli t0, a2, e32, m1, ta, ma vsetvli t0, a2, e32, m8, tu, ma
vle32.v v16, (a0) vle32.v v16, (a0)
sub a2, a2, t0 sub a2, a2, t0
vle32.v v24, (a1) vle32.v v24, (a1)
sh2add a0, t0, a0 sh2add a0, t0, a0
vfmul.vv v16, v16, v24 vfmacc.vv v8, v16, v24
sh2add a1, t0, a1 sh2add a1, t0, a1
vfredusum.vs v8, v16, v8
bnez a2, 1b bnez a2, 1b
vfmv.f.s fa0, v8 vsetvli t0, zero, e32, m8, ta, ma
vfredusum.vs v0, v8, v0
vfmv.f.s fa0, v0
NOHWF fmv.x.w a0, fa0 NOHWF fmv.x.w a0, fa0
ret ret
endfunc endfunc