lavc/ac3dsp: R-V V sum_square_butterfly_float

As we do not need to widen accumulators to 64 bits, we effectively get
double capacity for unrolling compared to the integer function. This
explains the slightly better performance gains.

ac3_sum_square_bufferfly_float_c:       65.2
ac3_sum_square_bufferfly_float_rvv_f32: 12.2
This commit is contained in:
Rémi Denis-Courmont 2024-04-29 22:19:11 +03:00
parent 6459966beb
commit 6cd97cd797
2 changed files with 44 additions and 1 deletions

View File

@ -30,6 +30,8 @@ void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
void ff_sum_square_butterfly_int32_rvv(int64_t *, const int32_t *,
const int32_t *, int);
void ff_sum_square_butterfly_float_rvv(float *, const float *,
const float *, int);
av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
{
@ -39,8 +41,10 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
if (flags & AV_CPU_FLAG_RVB_ADDR) {
if (flags & AV_CPU_FLAG_RVB_BASIC)
c->extract_exponents = ff_extract_exponents_rvb;
if (flags & AV_CPU_FLAG_RVV_F32)
if (flags & AV_CPU_FLAG_RVV_F32) {
c->float_to_fixed24 = ff_float_to_fixed24_rvv;
c->sum_square_butterfly_float = ff_sum_square_butterfly_float_rvv;
}
# if __riscv_xlen >= 64
if (flags & AV_CPU_FLAG_RVV_I64)
c->sum_square_butterfly_int32 = ff_sum_square_butterfly_int32_rvv;

View File

@ -78,3 +78,42 @@ func ff_sum_square_butterfly_int32_rvv, zve64x
ret
endfunc
#endif
func ff_sum_square_butterfly_float_rvv, zve32f
vsetvli t0, zero, e32, m8, ta, ma
vmv.v.x v0, zero
vmv.v.x v8, zero
1:
vsetvli t0, a3, e32, m4, tu, ma
vle32.v v16, (a1)
sub a3, a3, t0
vle32.v v20, (a2)
sh2add a1, t0, a1
vfadd.vv v24, v16, v20
sh2add a2, t0, a2
vfsub.vv v28, v16, v20
vfmacc.vv v0, v16, v16
vfmacc.vv v4, v20, v20
vfmacc.vv v8, v24, v24
vfmacc.vv v12, v28, v28
bnez a3, 1b
vsetvli t0, zero, e32, m4, ta, ma
vmv.s.x v16, zero
vmv.s.x v17, zero
vfredsum.vs v16, v0, v16
vmv.s.x v18, zero
vfredsum.vs v17, v4, v17
vmv.s.x v19, zero
vfredsum.vs v18, v8, v18
vfmv.f.s ft0, v16
vfredsum.vs v19, v12, v19
vfmv.f.s ft1, v17
fsw ft0, (a0)
vfmv.f.s ft2, v18
fsw ft1, 4(a0)
vfmv.f.s ft3, v19
fsw ft2, 8(a0)
fsw ft3, 12(a0)
ret
endfunc