mirror of https://git.ffmpeg.org/ffmpeg.git
lavu/float_dsp: rework RISC-V V scalar product
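1) Take the reduction sum out of the loop, leaving only a regular (element-wise) vector accumulation inside the loop. 2) Merge the addition and the multiplication into a single multiply-accumulate (vfmacc.vv). 3) Unroll by widening the register group from m1 to m8. Benchmark, before: scalarproduct_float_rvv_f32: 832.5; after: scalarproduct_float_rvv_f32: 275.2.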
This commit is contained in:
parent
b710f881ce
commit
29b9d616c2
|
@ -166,20 +166,22 @@ endfunc
|
||||||
|
|
||||||
// a0 = (a0).(a1) [0..a2-1]
|
// a0 = (a0).(a1) [0..a2-1]
|
||||||
func ff_scalarproduct_float_rvv, zve32f
|
func ff_scalarproduct_float_rvv, zve32f
|
||||||
vsetivli zero, 1, e32, m1, ta, ma
|
vsetvli t0, zero, e32, m8, ta, ma
|
||||||
vmv.s.x v8, zero
|
vmv.v.x v8, zero
|
||||||
|
vmv.s.x v0, zero
|
||||||
1:
|
1:
|
||||||
vsetvli t0, a2, e32, m1, ta, ma
|
vsetvli t0, a2, e32, m8, tu, ma
|
||||||
vle32.v v16, (a0)
|
vle32.v v16, (a0)
|
||||||
sub a2, a2, t0
|
sub a2, a2, t0
|
||||||
vle32.v v24, (a1)
|
vle32.v v24, (a1)
|
||||||
sh2add a0, t0, a0
|
sh2add a0, t0, a0
|
||||||
vfmul.vv v16, v16, v24
|
vfmacc.vv v8, v16, v24
|
||||||
sh2add a1, t0, a1
|
sh2add a1, t0, a1
|
||||||
vfredusum.vs v8, v16, v8
|
|
||||||
bnez a2, 1b
|
bnez a2, 1b
|
||||||
|
|
||||||
vfmv.f.s fa0, v8
|
vsetvli t0, zero, e32, m8, ta, ma
|
||||||
|
vfredusum.vs v0, v8, v0
|
||||||
|
vfmv.f.s fa0, v0
|
||||||
NOHWF fmv.x.w a0, fa0
|
NOHWF fmv.x.w a0, fa0
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
Loading…
Reference in New Issue