diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S index 1dc426e01c..f46b35fe91 100644 --- a/libavcodec/riscv/aacpsdsp_rvv.S +++ b/libavcodec/riscv/aacpsdsp_rvv.S @@ -85,63 +85,42 @@ NOHWD fsw fs\n, (4 * \n)(sp) flw fs4, (4 * ((6 * 2) + 0))(a1) flw fs5, (4 * ((6 * 2) + 1))(a1) - add a2, a2, 6 * 2 * 4 // point to filter[i][6][0] + add t2, a2, 6 * 2 * 4 // point to filter[i][6][0] li t4, 8 * 2 * 4 // filter byte stride slli a3, a3, 3 // output byte stride 1: .macro filter, vs0, vs1, fo0, fo1, fo2, fo3 vfmacc.vf v8, \fo0, \vs0 - vfmacc.vf v9, \fo2, \vs0 + vfmacc.vf v10, \fo2, \vs0 vfnmsac.vf v8, \fo1, \vs1 - vfmacc.vf v9, \fo3, \vs1 + vfmacc.vf v10, \fo3, \vs1 .endm - vsetvli t0, a4, e32, m1, ta, ma + vsetvli t0, a4, e32, m2, ta, ma /* * The filter (a2) has 16 segments, of which 13 need to be extracted. * R-V V supports only up to 8 segments, so unrolling is unavoidable. */ - addi t1, a2, -48 - vlse32.v v22, (a2), t4 - addi t2, a2, -44 - vlse32.v v16, (t1), t4 - addi t1, a2, -40 - vfmul.vf v8, v22, fs4 - vlse32.v v24, (t2), t4 - addi t2, a2, -36 - vfmul.vf v9, v22, fs5 - vlse32.v v17, (t1), t4 - addi t1, a2, -32 - vlse32.v v25, (t2), t4 - addi t2, a2, -28 - filter v16, v24, ft0, ft1, ft2, ft3 - vlse32.v v18, (t1), t4 - addi t1, a2, -24 - vlse32.v v26, (t2), t4 - addi t2, a2, -20 - filter v17, v25, ft4, ft5, ft6, ft7 - vlse32.v v19, (t1), t4 - addi t1, a2, -16 - vlse32.v v27, (t2), t4 - addi t2, a2, -12 - filter v18, v26, ft8, ft9, ft10, ft11 - vlse32.v v20, (t1), t4 - addi t1, a2, -8 vlse32.v v28, (t2), t4 - addi t2, a2, -4 - filter v19, v27, fa0, fa1, fa2, fa3 - vlse32.v v21, (t1), t4 + addi t1, a2, 16 + vfmul.vf v8, v28, fs4 + vlsseg4e32.v v16, (a2), t4 + vfmul.vf v10, v28, fs5 + filter v16, v18, ft0, ft1, ft2, ft3 + vlsseg4e32.v v24, (t1), t4 + filter v20, v22, ft4, ft5, ft6, ft7 + addi t1, a2, 32 + filter v24, v26, ft8, ft9, ft10, ft11 + vlsseg4e32.v v16, (t1), t4 sub a4, a4, t0 - vlse32.v v29, (t2), t4 + filter v28, v30, fa0, fa1, fa2, fa3 slli t1, t0, 3 + 1 + 2 // ctz(8 * 2 * 4) - add a2, a2, t1 - filter v20, v28, fa4, fa5, fa6, fa7 - filter v21, v29, fs0, fs1, fs2, fs3 - - add t2, a0, 4 - vsse32.v v8, (a0), a3 + filter v16, v18, fa4, fa5, fa6, fa7 mul t0, t0, a3 - vsse32.v v9, (t2), a3 + filter v20, v22, fs0, fs1, fs2, fs3 + add a2, a2, t1 + add t2, t2, t1 + vssseg2e32.v v8, (a0), a3 add a0, a0, t0 bnez a4, 1b