From 5b33104fca4057edb21598264ee17e087f10d816 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?=
Date: Thu, 9 Nov 2023 21:57:28 +0200
Subject: [PATCH] lavc/sbrdsp: R-V V hf_gen

hf_gen_c: 2922.7
hf_gen_rvv_f32: 731.5
---
 libavcodec/riscv/sbrdsp_init.c |  4 +++
 libavcodec/riscv/sbrdsp_rvv.S  | 50 ++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c
index c1ed5b639c..e5736452ec 100644
--- a/libavcodec/riscv/sbrdsp_init.c
+++ b/libavcodec/riscv/sbrdsp_init.c
@@ -27,6 +27,9 @@ void ff_sbr_sum64x5_rvv(float *z);
 float ff_sbr_sum_square_rvv(float (*x)[2], int n);
 void ff_sbr_neg_odd_64_rvv(float *x);
 void ff_sbr_autocorrelate_rvv(const float x[40][2], float phi[3][2][2]);
+void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2],
+                       const float alpha0[2], const float alpha1[2],
+                       float bw, int start, int end);
 void ff_sbr_hf_g_filt_rvv(float (*Y)[2], const float (*X_high)[40][2],
                           const float *g_filt, int m_max, intptr_t ixh);
 
@@ -39,6 +42,7 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c)
     if (flags & AV_CPU_FLAG_RVB_ADDR) {
         c->sum64x5 = ff_sbr_sum64x5_rvv;
         c->sum_square = ff_sbr_sum_square_rvv;
+        c->hf_gen = ff_sbr_hf_gen_rvv;
         c->hf_g_filt = ff_sbr_hf_g_filt_rvv;
     }
     c->autocorrelate = ff_sbr_autocorrelate_rvv;
diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S
index 2f3a0969d7..43fab1f65f 100644
--- a/libavcodec/riscv/sbrdsp_rvv.S
+++ b/libavcodec/riscv/sbrdsp_rvv.S
@@ -174,6 +174,56 @@ func ff_sbr_autocorrelate_rvv, zve32f
         ret
 endfunc
 
+func ff_sbr_hf_gen_rvv, zve32f
+NOHWF   fmv.w.x   fa0, a4
+NOHWF   mv        a4, a5
+NOHWF   mv        a5, a6
+        flw       ft2, 0(a2)
+        fmul.s    fa1, fa0, fa0          // bw * bw
+        sh3add    a1, a5, a1
+        flw       ft3, 4(a2)
+        fmul.s    fa2, ft2, fa0          // alpha[2]
+        sh3add    a0, a5, a0
+        flw       ft0, 0(a3)
+        fmul.s    fa3, ft3, fa0          // alpha[3]
+        sub       a5, a5, a4
+        flw       ft1, 4(a3)
+        fmul.s    fa0, ft0, fa1          // alpha[0]
+        flw       ft0, -16(a1)           // X_low[end - 2][0]
+        fmul.s    fa1, ft1, fa1          // alpha[1]
+        flw       ft1, -12(a1)           // X_low[end - 2][1]
+        flw       ft2, -8(a1)            // X_low[end - 1][0]
+        flw       ft3, -4(a1)            // X_low[end - 1][1]
+        addi      a1, a1, -16
+1:
+        vsetvli   t0, a5, e32, m4, ta, ma
+        slli      t1, t0, 3
+        sub       a1, a1, t1
+        vlseg2e32.v v0, (a1)             // X_low[i - 2]
+        sub       a0, a0, t1
+        vfslide1down.vf v8, v0, ft0      // X_low[i - 1][0]
+        sub       a5, a5, t0
+        vfslide1down.vf v12, v4, ft1     // X_low[i - 1][1]
+        vfslide1down.vf v16, v8, ft2     // X_low[i    ][0]
+        vfslide1down.vf v20, v12, ft3    // X_low[i    ][1]
+        vfmacc.vf v16, fa0, v0
+        vfmacc.vf v20, fa0, v4
+        vfmv.f.s  ft0, v0
+        vfnmsac.vf v16, fa1, v4
+        vfmacc.vf v20, fa1, v0
+        vfmv.f.s  ft1, v4
+        vfmacc.vf v16, fa2, v8
+        vfmacc.vf v20, fa2, v12
+        vfmv.f.s  ft2, v8
+        vfnmsac.vf v16, fa3, v12
+        vfmacc.vf v20, fa3, v8
+        vfmv.f.s  ft3, v12
+        vsseg2e32.v v16, (a0)
+        bnez      a5, 1b
+
+        ret
+endfunc
+
 func ff_sbr_hf_g_filt_rvv, zve32f
         li        t1, 40 * 2 * 4
         sh3add    a1, a4, a1
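
Review note: the vector loop above mirrors the scalar float reference,
sbr_hf_gen() in libavcodec/sbrdsp_template.c. For convenience, here is a
simplified sketch of that reference (sbr_hf_gen_ref is an illustrative name,
not an FFmpeg symbol), with the alpha[] coefficients annotated with the FPRs
the RVV code keeps them in:

/* Scalar sketch of what ff_sbr_hf_gen_rvv computes: X_low plus a two-tap
 * complex linear prediction whose coefficients are scaled by bw and bw^2. */
static void sbr_hf_gen_ref(float (*X_high)[2], const float (*X_low)[2],
                           const float alpha0[2], const float alpha1[2],
                           float bw, int start, int end)
{
    const float alpha[4] = {
        alpha1[0] * bw * bw,  /* fa0 in the RVV version */
        alpha1[1] * bw * bw,  /* fa1 */
        alpha0[0] * bw,       /* fa2 */
        alpha0[1] * bw,       /* fa3 */
    };

    for (int i = start; i < end; i++) {
        /* real part of (alpha[0] + j*alpha[1]) * X_low[i - 2]
         * + (alpha[2] + j*alpha[3]) * X_low[i - 1] + X_low[i] */
        X_high[i][0] = X_low[i - 2][0] * alpha[0]
                     - X_low[i - 2][1] * alpha[1]
                     + X_low[i - 1][0] * alpha[2]
                     - X_low[i - 1][1] * alpha[3]
                     + X_low[i][0];
        /* imaginary part of the same expression */
        X_high[i][1] = X_low[i - 2][1] * alpha[0]
                     + X_low[i - 2][0] * alpha[1]
                     + X_low[i - 1][1] * alpha[2]
                     + X_low[i - 1][0] * alpha[3]
                     + X_low[i][1];
    }
}

The assembly iterates from end down to start: after the single vlseg2e32.v
load of an X_low[i - 2] block, the four vfslide1down.vf steps synthesize the
real/imaginary X_low[i - 1] and X_low[i] vectors from it, shifting in the
boundary scalars carried over from the previous (higher-addressed) block via
vfmv.f.s, so each input element is loaded from memory only once.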