lavc/aacpsdsp: rework R-V V add_squares

Segmented loads may be slower than unit-strided loads. So this instead uses
a unit-strided 64-bit load, followed by narrowing shifts (by 0 and by 32
bits) to split each 64-bit element into its two 32-bit halves.

Before:
ps_add_squares_c: 60757.7
ps_add_squares_rvv_f32: 22242.5

After:
ps_add_squares_c: 60516.0
ps_add_squares_rvv_i64: 17067.7
This commit is contained in:
Rémi Denis-Courmont 2023-07-15 23:23:50 +03:00
parent ab78d22553
commit eb508702a8
2 changed files with 8 additions and 4 deletions

View File

@ -46,6 +46,7 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
if (flags & AV_CPU_FLAG_RVB_ADDR) {
if (flags & AV_CPU_FLAG_RVV_I64)
c->add_squares = ff_ps_add_squares_rvv;
c->mul_pair_single = ff_ps_mul_pair_single_rvv;
c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;

View File

@ -1,5 +1,5 @@
/*
* Copyright © 2022 Rémi Denis-Courmont.
* Copyright © 2022-2023 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
@ -20,13 +20,16 @@
#include "libavutil/riscv/asm.S"
func ff_ps_add_squares_rvv, zve32f
func ff_ps_add_squares_rvv, zve64f
li t1, 32
1:
vsetvli t0, a2, e32, m4, ta, ma
vlseg2e32.v v24, (a1)
vle64.v v8, (a1)
sub a2, a2, t0
vnsrl.wx v24, v8, zero
vle32.v v16, (a0)
sh3add a1, t0, a1
vnsrl.wx v28, v8, t1
vfmacc.vv v16, v24, v24
vfmacc.vv v16, v28, v28
vse32.v v16, (a0)