lavc/flacdsp: R-V V LPC32

The entire set of 32 coefficients and corresponding past 32 samples can
fit in a single vector (with LMUL=8) exactly, but... since widening
double the needed vector sizes, we still end up too short with 128-bit
vectors. This adds a very simple version for future 256+-bit hardware,
and for pred_orders values up to 16, and a bit more involved loop for
for 128-bit hardware with pred_orders between 17 and 32.

With 128-bit hardware, the benchmarks look like this:
flac_lpc_32_13_c:       30152.0
flac_lpc_32_13_rvv_i32: 10244.7
flac_lpc_32_16_c:       37314.2
flac_lpc_32_16_rvv_i32: 10126.2
flac_lpc_32_29_c:       61910.0
flac_lpc_32_29_rvv_i32: 14495.2
flac_lpc_32_32_c:       68204.0
flac_lpc_32_32_rvv_i32: 13273.7
This commit is contained in:
Rémi Denis-Courmont 2023-11-15 19:46:18 +02:00
parent 8a984aca59
commit 295092b46d
2 changed files with 69 additions and 0 deletions

View File

@ -22,8 +22,13 @@
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/riscv/cpu.h"
#include "libavcodec/flacdsp.h"
void ff_flac_lpc32_rvv(int32_t *decoded, const int coeffs[32],
int pred_order, int qlevel, int len);
void ff_flac_lpc32_rvv_simple(int32_t *decoded, const int coeffs[32],
int pred_order, int qlevel, int len);
void ff_flac_decorrelate_indep2_16_rvv(uint8_t **out, int32_t **in,
int channels, int len, int shift);
void ff_flac_decorrelate_indep4_16_rvv(uint8_t **out, int32_t **in,
@ -60,6 +65,13 @@ av_cold void ff_flacdsp_init_riscv(FLACDSPContext *c, enum AVSampleFormat fmt,
int flags = av_get_cpu_flags();
if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
int vlenb = ff_get_rv_vlenb();
if (vlenb == 16)
c->lpc32 = ff_flac_lpc32_rvv;
else if (vlenb > 16)
c->lpc32 = ff_flac_lpc32_rvv_simple;
switch (fmt) {
case AV_SAMPLE_FMT_S16:
switch (channels) {

View File

@ -21,6 +21,63 @@
#include "libavutil/riscv/asm.S"
#if (__riscv_xlen == 64)
func ff_flac_lpc32_rvv, zve32x
addi t2, a2, -16
ble t2, zero, ff_flac_lpc32_rvv_simple
vsetivli zero, 1, e64, m1, ta, ma
vmv.s.x v0, zero
vsetvli zero, a2, e32, m8, ta, ma
vle32.v v8, (a1)
sub a4, a4, a2
vle32.v v16, (a0)
sh2add a0, a2, a0
1:
vsetvli zero, a2, e32, m4, ta, ma
vwmul.vv v24, v8, v16
vsetvli zero, t2, e32, m4, tu, ma
vwmacc.vv v24, v12, v20
vsetvli zero, a2, e64, m8, ta, ma
vredsum.vs v24, v24, v0
lw t0, (a0)
addi a4, a4, -1
vmv.x.s t1, v24
vsetvli zero, a2, e32, m8, ta, ma
sra t1, t1, a3
add t0, t0, t1
vslide1down.vx v16, v16, t0
sw t0, (a0)
addi a0, a0, 4
bnez a4, 1b
ret
endfunc
func ff_flac_lpc32_rvv_simple, zve32x
vsetivli zero, 1, e64, m1, ta, ma
vmv.s.x v0, zero
vsetvli zero, a2, e32, m4, ta, ma
vle32.v v8, (a1)
sub a4, a4, a2
vle32.v v16, (a0)
sh2add a0, a2, a0
1:
vwmul.vv v24, v8, v16
vsetvli zero, zero, e64, m8, ta, ma
vredsum.vs v24, v24, v0
lw t0, (a0)
addi a4, a4, -1
vmv.x.s t1, v24
vsetvli zero, zero, e32, m4, ta, ma
sra t1, t1, a3
add t0, t0, t1
vslide1down.vx v16, v16, t0
sw t0, (a0)
addi a0, a0, 4
bnez a4, 1b
ret
endfunc
func ff_flac_decorrelate_indep2_16_rvv, zve32x
ld a0, (a0)
ld a2, 8(a1)