lavc/ac3dsp: R-V V sum_square_butterfly_int32

ac3_sum_square_bufferfly_int32_c: 61.0 ac3_sum_square_bufferfly_int32_rvv_i64: 14.7
2024-04-29 22:10:15 +03:00 · 2024-04-29 22:10:15 +03:00 · 6459966beb
parent 95568c4e31
commit 6459966beb
2 changed files with 47 additions and 0 deletions
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@ -28,6 +28,8 @@

 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
 void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
+void ff_sum_square_butterfly_int32_rvv(int64_t *, const int32_t *,
+                                       const int32_t *, int); /* RVV asm: four int64 sums out, two int32 inputs, element count */

 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@ -39,6 +41,10 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
            c->extract_exponents = ff_extract_exponents_rvb;
        if (flags & AV_CPU_FLAG_RVV_F32)
            c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+# if __riscv_xlen >= 64
+        if (flags & AV_CPU_FLAG_RVV_I64)
+            c->sum_square_butterfly_int32 = ff_sum_square_butterfly_int32_rvv;
+# endif
    }
 #endif
 }
--- a/libavcodec/riscv/ac3dsp_rvv.S
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@ -37,3 +37,44 @@ func ff_float_to_fixed24_rvv, zve32f

        ret
 endfunc
+
+#if __riscv_xlen >= 64 // RV64 only: the results are written with sd below
+func ff_sum_square_butterfly_int32_rvv, zve64x // a0: int64 out[4]; a1, a2: int32 inputs; a3: element count
+        vsetvli    t0, zero, e64, m8, ta, ma // VLMAX at e64/m8 so one vmv clears 8 registers
+        vmv.v.x    v0, zero // clear v0..v7
+        vmv.v.x    v8, zero // clear v8..v15; v0/v4/v8/v12 are the four e64,m4 accumulators
+1:
+        vsetvli    t0, a3, e32, m2, tu, ma // t0 = elements this pass; tu keeps accumulator lanes past vl intact
+        vle32.v    v16, (a1) // a = next t0 elements from a1
+        sub        a3, a3, t0 // elements still to process
+        vle32.v    v20, (a2) // b = next t0 elements from a2
+        sh2add     a1, t0, a1 // a1 += 4 * t0; NOTE(review): sh2add is Zba but func only requests zve64x, confirm it is provided elsewhere
+        vadd.vv    v24, v16, v20 // a + b, 32-bit wrapping add
+        sh2add     a2, t0, a2 // a2 += 4 * t0
+        vsub.vv    v28, v16, v20 // a - b, 32-bit wrapping subtract
+        vwmacc.vv  v0, v16, v16 // v0  += a * a, signed widening to 64 bits
+        vwmacc.vv  v4, v20, v20 // v4  += b * b
+        vwmacc.vv  v8, v24, v24 // v8  += (a + b) * (a + b)
+        vwmacc.vv  v12, v28, v28 // v12 += (a - b) * (a - b)
+        bnez       a3, 1b // loop until every element is consumed
+
+        vsetvli    t0, zero, e64, m4, ta, ma // VLMAX at e64/m4: cover every lane of each accumulator group
+        vmv.s.x    v16, zero // v16[0] = 0, scalar seed for the reduction
+        vmv.s.x    v17, zero // seed for the second sum
+        vredsum.vs v16, v0, v16 // v16[0] = sum over all lanes of v0
+        vmv.s.x    v18, zero // seed for the third sum
+        vredsum.vs v17, v4, v17 // v17[0] = sum over all lanes of v4
+        vmv.s.x    v19, zero // seed for the fourth sum
+        vredsum.vs v18, v8, v18 // v18[0] = sum over all lanes of v8
+        vmv.x.s    t0, v16 // t0 = sum of a * a
+        vredsum.vs v19, v12, v19 // v19[0] = sum over all lanes of v12
+        vmv.x.s    t1, v17 // t1 = sum of b * b
+        sd         t0,   (a0) // out[0]
+        vmv.x.s    t2, v18 // t2 = sum of (a + b)^2
+        sd         t1,  8(a0) // out[1]
+        vmv.x.s    t3, v19 // t3 = sum of (a - b)^2
+        sd         t2, 16(a0) // out[2]
+        sd         t3, 24(a0) // out[3]
+        ret
+endfunc
+#endif