lavu/float_dsp: optimise R-V V fmul_reverse & fmul_window

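Roll the loop to avoid slow gathers: both functions now run at e16/m1 and e32/m2, instead of e16/m2 and e32/m4 (fmul_window) or e16/m4 and e32/m8 (fmul_reverse). The larger register groupings remain possible, but are slower due to the gather.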

Before:
vector_fmul_reverse_c:       1561.7
vector_fmul_reverse_rvv_f32: 2410.2
vector_fmul_window_c:        2068.2
vector_fmul_window_rvv_f32:  1879.5

After:
vector_fmul_reverse_c:       1561.7
vector_fmul_reverse_rvv_f32:  916.2
vector_fmul_window_c:        2068.2
vector_fmul_window_rvv_f32:  1202.5
This commit is contained in:
Rémi Denis-Courmont 2023-11-19 13:36:01 +02:00
parent 3a134e8299
commit e49f41fb27
1 changed file with 8 additions and 6 deletions

View File

@ -75,18 +75,19 @@ endfunc
func ff_vector_fmul_window_rvv, zve32f func ff_vector_fmul_window_rvv, zve32f
// a0: dst, a1: src0, a2: src1, a3: window, a4: length // a0: dst, a1: src0, a2: src1, a3: window, a4: length
vsetvli t0, zero, e16, m2, ta, ma // e16/m2 and e32/m4 are possible but slower due to gather.
vsetvli t0, zero, e16, m1, ta, ma
sh2add a2, a4, a2 sh2add a2, a4, a2
vid.v v0 vid.v v0
sh3add t3, a4, a3 sh3add t3, a4, a3
vadd.vi v0, v0, 1 vadd.vi v0, v0, 1
sh3add t0, a4, a0 sh3add t0, a4, a0
1: 1:
vsetvli t2, a4, e16, m2, ta, ma vsetvli t2, a4, e16, m1, ta, ma
slli t4, t2, 2 slli t4, t2, 2
vrsub.vx v2, v0, t2 vrsub.vx v2, v0, t2
sub t3, t3, t4 sub t3, t3, t4
vsetvli zero, zero, e32, m4, ta, ma vsetvli zero, zero, e32, m2, ta, ma
sub a2, a2, t4 sub a2, a2, t4
vle32.v v8, (t3) vle32.v v8, (t3)
sub t0, t0, t4 sub t0, t0, t4
@ -133,16 +134,17 @@ endfunc
// TODO factor vrsub, separate last iteration? // TODO factor vrsub, separate last iteration?
// (a0) = (a1) * reverse(a2) [0..a3-1] // (a0) = (a1) * reverse(a2) [0..a3-1]
func ff_vector_fmul_reverse_rvv, zve32f func ff_vector_fmul_reverse_rvv, zve32f
vsetvli t0, zero, e16, m4, ta, ma // e16/m4 and e32/m8 are possible but slower due to gather.
vsetvli t0, zero, e16, m1, ta, ma
sh2add a2, a3, a2 sh2add a2, a3, a2
vid.v v0 vid.v v0
vadd.vi v0, v0, 1 vadd.vi v0, v0, 1
1: 1:
vsetvli t0, a3, e16, m4, ta, ma vsetvli t0, a3, e16, m1, ta, ma
slli t1, t0, 2 slli t1, t0, 2
vrsub.vx v4, v0, t0 // v4[i] = [VL-1, VL-2... 1, 0] vrsub.vx v4, v0, t0 // v4[i] = [VL-1, VL-2... 1, 0]
sub a2, a2, t1 sub a2, a2, t1
vsetvli zero, zero, e32, m8, ta, ma vsetvli zero, zero, e32, m2, ta, ma
vle32.v v8, (a2) vle32.v v8, (a2)
sub a3, a3, t0 sub a3, a3, t0
vle32.v v16, (a1) vle32.v v16, (a1)