From 0183c2c83091b727f3b881c8797b45c8f6915f1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <remi@remlab.net>
Date: Sun, 19 Nov 2023 17:50:49 +0200
Subject: [PATCH] lavc/aacpsdsp: use LMUL=2 and amortise strides

The input is laid out in 16 segments, of which 13 actually need to be
loaded. There are no really efficient ways to deal with this:
1) If we load 8 segments with unit stride, then narrow to 16 segments with
   right shifts, we can only get one half-size vector per segment, or just 2
   elements per vector (EMUL=1/2) - at least with 128-bit vectors.
   This ends up unsurprisingly about as fast as the C code.
2) The current approach is to load with strides. We keep that approach,
   but improve it using three 4-segmented loads instead of 12 single-segment
   loads. This divides the number of distinct loaded addresses by 4.
3) A potential third approach would be to avoid segmentation altogether
   and splat the scalar coefficient into vectors. Then we can use a
   unit-stride and maximum EMUL. But the downside then is that we have to
   multiply the 3 (of 16) unused segments with zero as part of the
   multiply-accumulate operations.

In addition, we also reuse vectors mid-loop so as to increase the EMUL
from 1 to 2, which also improves performance a little bit.

Overall the gains are quite small with the device under test, as it does
not deal with segmented loads very well. But at least the code is tidier,
and should enjoy bigger speed-ups on better hardware implementations.

Before:
ps_hybrid_analysis_c:       1819.2
ps_hybrid_analysis_rvv_f32: 1037.0 (before)
ps_hybrid_analysis_rvv_f32:  990.0 (after)
---
 libavcodec/riscv/aacpsdsp_rvv.S | 61 +++++++++++----------------------
 1 file changed, 20 insertions(+), 41 deletions(-)

diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index 1dc426e01c..f46b35fe91 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -85,63 +85,42 @@ NOHWD   fsw     fs\n, (4 * \n)(sp)
         flw     fs4, (4 * ((6 * 2) + 0))(a1)
         flw     fs5, (4 * ((6 * 2) + 1))(a1)
 
-        add        a2, a2, 6 * 2 * 4 // point to filter[i][6][0]
+        add     t2, a2, 6 * 2 * 4 // point to filter[i][6][0]
         li         t4, 8 * 2 * 4 // filter byte stride
         slli       a3, a3, 3 // output byte stride
 1:
         .macro filter, vs0, vs1, fo0, fo1, fo2, fo3
         vfmacc.vf  v8, \fo0, \vs0
-        vfmacc.vf  v9, \fo2, \vs0
+        vfmacc.vf  v10, \fo2, \vs0
         vfnmsac.vf v8, \fo1, \vs1
-        vfmacc.vf  v9, \fo3, \vs1
+        vfmacc.vf  v10, \fo3, \vs1
         .endm
 
-        vsetvli    t0, a4, e32, m1, ta, ma
+        vsetvli    t0, a4, e32, m2, ta, ma
         /*
          * The filter (a2) has 16 segments, of which 13 need to be extracted.
          * R-V V supports only up to 8 segments, so unrolling is unavoidable.
          */
-        addi       t1, a2, -48
-        vlse32.v   v22, (a2), t4
-        addi       t2, a2, -44
-        vlse32.v   v16, (t1), t4
-        addi       t1, a2, -40
-        vfmul.vf   v8, v22, fs4
-        vlse32.v   v24, (t2), t4
-        addi       t2, a2, -36
-        vfmul.vf   v9, v22, fs5
-        vlse32.v   v17, (t1), t4
-        addi       t1, a2, -32
-        vlse32.v   v25, (t2), t4
-        addi       t2, a2, -28
-        filter     v16, v24, ft0, ft1, ft2, ft3
-        vlse32.v   v18, (t1), t4
-        addi       t1, a2, -24
-        vlse32.v   v26, (t2), t4
-        addi       t2, a2, -20
-        filter     v17, v25, ft4, ft5, ft6, ft7
-        vlse32.v   v19, (t1), t4
-        addi       t1, a2, -16
-        vlse32.v   v27, (t2), t4
-        addi       t2, a2, -12
-        filter     v18, v26, ft8, ft9, ft10, ft11
-        vlse32.v   v20, (t1), t4
-        addi       t1, a2, -8
         vlse32.v   v28, (t2), t4
-        addi       t2, a2, -4
-        filter     v19, v27, fa0, fa1, fa2, fa3
-        vlse32.v   v21, (t1), t4
+        addi       t1, a2, 16
+        vfmul.vf   v8, v28, fs4
+        vlsseg4e32.v v16, (a2), t4
+        vfmul.vf   v10, v28, fs5
+        filter     v16, v18, ft0, ft1, ft2, ft3
+        vlsseg4e32.v v24, (t1), t4
+        filter     v20, v22, ft4, ft5, ft6, ft7
+        addi       t1, a2, 32
+        filter     v24, v26, ft8, ft9, ft10, ft11
+        vlsseg4e32.v v16, (t1), t4
         sub        a4, a4, t0
-        vlse32.v   v29, (t2), t4
+        filter     v28, v30, fa0, fa1, fa2, fa3
         slli       t1, t0, 3 + 1 + 2 // ctz(8 * 2 * 4)
-        add        a2, a2, t1
-        filter     v20, v28, fa4, fa5, fa6, fa7
-        filter     v21, v29, fs0, fs1, fs2, fs3
-
-        add        t2, a0, 4
-        vsse32.v   v8, (a0), a3
+        filter     v16, v18, fa4, fa5, fa6, fa7
         mul        t0, t0, a3
-        vsse32.v   v9, (t2), a3
+        filter     v20, v22, fs0, fs1, fs2, fs3
+        add        a2, a2, t1
+        add        t2, t2, t1
+        vssseg2e32.v v8, (a0), a3
         add        a0, a0, t0
         bnez       a4, 1b