mirror of https://git.ffmpeg.org/ffmpeg.git
lavu/float_dsp: rework RISC-V V scalar product
1) Move the reduction sum out of the loop, leaving only a regular vector addition inside the loop. 2) Merge the addition and the multiplication into a single multiply-accumulate. 3) Unroll. Benchmark results - before: scalarproduct_float_rvv_f32: 832.5; after: scalarproduct_float_rvv_f32: 275.2
This commit is contained in:
parent
b710f881ce
commit
29b9d616c2
|
@ -166,20 +166,22 @@ endfunc
|
|||
|
||||
// NOTE(review): this listing is a rendered diff hunk. Lines from the old and
// the new version of the function appear interleaved without +/- markers (the
// "|" and "||||" lines are table residue), so it is not assemblable as shown.
// Which side each instruction belongs to is inferred from the commit message
// and should be confirmed against the repository.
//
// Dot product of two float arrays.
// In:  a0 = first array, a1 = second array, a2 = element count
// Out: fa0 = sum of a0[i] * a1[i]
// a0 = (a0).(a1) [0..a2-1]
|
||||
func ff_scalarproduct_float_rvv, zve32f
|
||||
// Presumably old version: select a single 32-bit element and clear v8[0],
// which then seeds the reduction performed inside the loop.
vsetivli zero, 1, e32, m1, ta, ma
|
||||
vmv.s.x v8, zero
|
||||
// Presumably new version: select the maximum vector length for 32-bit
// elements with 8-register groups, clear the whole v8 group (the per-lane
// accumulator) and clear v0[0] (seed of the final reduction).
vsetvli t0, zero, e32, m8, ta, ma
|
||||
vmv.v.x v8, zero
|
||||
vmv.s.x v0, zero
|
||||
1:
|
||||
// t0 = number of elements processed in this pass (at most a2).
// Presumably old version (single-register groups):
vsetvli t0, a2, e32, m1, ta, ma
|
||||
// Presumably new version (8-register groups). Tail-undisturbed so that
// accumulator lanes beyond t0 keep their partial sums on a shorter last pass.
vsetvli t0, a2, e32, m8, tu, ma
|
||||
// v16 = t0 floats loaded from the first array
vle32.v v16, (a0)
|
||||
// a2 = elements still to process
sub a2, a2, t0
|
||||
// v24 = t0 floats loaded from the second array
vle32.v v24, (a1)
|
||||
// a0 += t0 * 4 (advance first pointer by t0 floats)
sh2add a0, t0, a0
|
||||
// Presumably old version: v16 = v16 * v24, summed by the in-loop reduction.
vfmul.vv v16, v16, v24
|
||||
// Presumably new version: v8 += v16 * v24, lane by lane.
vfmacc.vv v8, v16, v24
|
||||
// a1 += t0 * 4 (advance second pointer by t0 floats)
sh2add a1, t0, a1
|
||||
// Presumably old version: v8[0] += sum of the active elements of v16,
// executed on every pass.
vfredusum.vs v8, v16, v8
|
||||
// Repeat until no elements remain.
bnez a2, 1b
|
||||
|
||||
// Presumably old version: result = v8[0].
vfmv.f.s fa0, v8
|
||||
// Presumably new version: switch back to the maximum vector length so the
// single reduction after the loop covers every accumulator lane,
// then v0[0] = v0[0] + sum of v8 and result = v0[0].
vsetvli t0, zero, e32, m8, ta, ma
|
||||
vfredusum.vs v0, v8, v0
|
||||
vfmv.f.s fa0, v0
|
||||
// NOHWF is a macro defined outside this listing; presumably it emits the
// instruction only when floats are returned in integer registers - confirm.
// The instruction itself copies the bit pattern of fa0 into a0.
NOHWF fmv.x.w a0, fa0
|
||||
ret
|
||||
endfunc
|
||||
|
|
Loading…
Reference in New Issue