lavc/ac3dsp: R-V V sum_square_butterfly_float

As we do not need to widen accumulators to 64 bits, we effectively get double capacity for unrolling compared to the integer function. This explains the slightly better performance gains. ac3_sum_square_bufferfly_float_c: 65.2 ac3_sum_square_bufferfly_float_rvv_f32: 12.2
2024-04-29 22:19:11 +03:00 · 2024-04-29 22:19:11 +03:00 · 6cd97cd797
parent 6459966beb
commit 6cd97cd797
2 changed files with 44 additions and 1 deletions
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@ -30,6 +30,8 @@ void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
 void ff_sum_square_butterfly_int32_rvv(int64_t *, const int32_t *,
                                       const int32_t *, int);
+void ff_sum_square_butterfly_float_rvv(float *, const float *,
+                                       const float *, int);

 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@ -39,8 +41,10 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
    if (flags & AV_CPU_FLAG_RVB_ADDR) {
        if (flags & AV_CPU_FLAG_RVB_BASIC)
            c->extract_exponents = ff_extract_exponents_rvb;
-        if (flags & AV_CPU_FLAG_RVV_F32)
+        if (flags & AV_CPU_FLAG_RVV_F32) {
            c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+            c->sum_square_butterfly_float = ff_sum_square_butterfly_float_rvv;
+        }
 # if __riscv_xlen >= 64
        if (flags & AV_CPU_FLAG_RVV_I64)
            c->sum_square_butterfly_int32 = ff_sum_square_butterfly_int32_rvv;
--- a/libavcodec/riscv/ac3dsp_rvv.S
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@ -78,3 +78,42 @@ func ff_sum_square_butterfly_int32_rvv, zve64x
        ret
 endfunc
 #endif
+
+func ff_sum_square_butterfly_float_rvv, zve32f
+        vsetvli     t0, zero, e32, m8, ta, ma
+        vmv.v.x     v0, zero
+        vmv.v.x     v8, zero
+1:
+        vsetvli     t0, a3, e32, m4, tu, ma
+        vle32.v     v16, (a1)
+        sub         a3, a3, t0
+        vle32.v     v20, (a2)
+        sh2add      a1, t0, a1
+        vfadd.vv    v24, v16, v20
+        sh2add      a2, t0, a2
+        vfsub.vv    v28, v16, v20
+        vfmacc.vv   v0, v16, v16
+        vfmacc.vv   v4, v20, v20
+        vfmacc.vv   v8, v24, v24
+        vfmacc.vv   v12, v28, v28
+        bnez        a3, 1b
+
+        vsetvli     t0, zero, e32, m4, ta, ma
+        vmv.s.x     v16, zero
+        vmv.s.x     v17, zero
+        vfredsum.vs v16, v0, v16
+        vmv.s.x     v18, zero
+        vfredsum.vs v17, v4, v17
+        vmv.s.x     v19, zero
+        vfredsum.vs v18, v8, v18
+        vfmv.f.s    ft0, v16
+        vfredsum.vs v19, v12, v19
+        vfmv.f.s    ft1, v17
+        fsw         ft0,   (a0)
+        vfmv.f.s    ft2, v18
+        fsw         ft1,  4(a0)
+        vfmv.f.s    ft3, v19
+        fsw         ft2,  8(a0)
+        fsw         ft3, 12(a0)
+        ret
+endfunc