lavc/lpc: R-V V compute_autocorr

The loop iterates over the length of the vector, not the order. This
avoids reloading the same data for each lag value. However, this means
the loop only works if the maximum order is no larger than VLENB: with
64-bit elements and LMUL=8, one register group holds exactly
(8 x VLENB) / 8 = VLENB doubles.

The loop is roughly equivalent to:

    for (size_t j = 0; j < lag; j++)
        autoc[j] = 1.;

    while (len > lag) {
        for (ptrdiff_t j = 0; j < lag; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }

    while (len > 0) {
        for (ptrdiff_t j = 0; j < len; j++)
            autoc[j] += data[j] * *data;
        data++;
        len--;
    }
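
For contrast, a naive scalar version would put the lag in the outer
loop and re-read the data once per lag value, which is exactly the
reloading the restructured loop above avoids. A sketch with the same
semantics as the pseudocode, including the 1.0 bias (autocorr_naive is
a made-up name, not FFmpeg's reference implementation):

    void autocorr_naive(const double *data, ptrdiff_t len, int lag,
                        double *autoc)
    {
        for (int j = 0; j < lag; j++) {
            double sum = 1.;
            /* the whole data[] array is re-read for every lag value */
            for (ptrdiff_t i = 0; i + j < len; i++)
                sum += data[i] * data[i + j];
            autoc[j] = sum;
        }
    }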

Register pressure is only at 50% (the data window and the accumulators
each occupy one LMUL=8 group of eight vector registers, v0-v7 and
v16-v23), so it should be possible to implement the same loop for
orders up to 2xVLENB, but this is left for future work.

Performance numbers are all over the place, from ~1.25x to ~4x
speedups, but at least they are always noticeably better than nothing.
commit 918b3ed2d5
parent 1a04959532
Author: Rémi Denis-Courmont
Date: 2023-12-08 21:38:20 +02:00

2 changed files with 36 additions and 1 deletion

--- a/libavcodec/riscv/lpc_init.c
+++ b/libavcodec/riscv/lpc_init.c
@@ -22,16 +22,22 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
 #include "libavcodec/lpc.h"
 
 void ff_lpc_apply_welch_window_rvv(const int32_t *, ptrdiff_t, double *);
+void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
 
 av_cold void ff_lpc_init_riscv(LPCContext *c)
 {
 #if HAVE_RVV && (__riscv_xlen >= 64)
     int flags = av_get_cpu_flags();
 
-    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR))
+    if ((flags & AV_CPU_FLAG_RVV_F64) && (flags & AV_CPU_FLAG_RVB_ADDR)) {
         c->lpc_apply_welch_window = ff_lpc_apply_welch_window_rvv;
+
+        if (ff_get_rv_vlenb() >= c->max_order)
+            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_rvv;
+    }
 #endif
 }
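
The runtime gate added above mirrors the VLENB constraint from the
commit message: at e64 with LMUL=8, one register group holds
(8 x VLENB) / 8 = VLENB doubles, so any order up to VLENB fits in the
group. A sketch of that arithmetic (max_autocorr_order is a
hypothetical helper, not an FFmpeg API):

    #include <stddef.h>

    /* Elements in one e64, LMUL=8 register group: 8 registers of VLENB
     * bytes each, divided by 8 bytes per double -- equals VLENB. */
    static size_t max_autocorr_order(size_t vlenb)
    {
        return (8 * vlenb) / sizeof(double);
    }

Because the check is made once at init time against c->max_order, the
assembly can assume the whole window fits in a single register group.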

--- a/libavcodec/riscv/lpc_rvv.S
+++ b/libavcodec/riscv/lpc_rvv.S
@@ -85,4 +85,33 @@ func ff_lpc_apply_welch_window_rvv, zve64d
         ret
 endfunc
+
+func ff_lpc_compute_autocorr_rvv, zve64d
+        li       t0, 1
+        vsetvli  zero, a2, e64, m8, ta, ma # VL = lag
+        fcvt.d.l ft0, t0                   # ft0 = 1.0
+        vle64.v  v0, (a0)                  # v0[j] = data[j]
+        sh3add   a0, a2, a0                # data += lag
+        vfmv.v.f v16, ft0                  # autoc[j] = 1.
+        bge      a2, a1, 2f                # skip first loop if lag >= len
+1:
+        vfmv.f.s ft0, v0                   # ft0 = data[i]
+        fld      ft1, (a0)                 # ft1 = data[lag + i]
+        vfmacc.vf v16, ft0, v0             # v16[j] += data[i] * data[i + j]
+        addi     a1, a1, -1                # len--
+        vfslide1down.vf v0, v0, ft1        # slide window down, append data[lag + i]
+        addi     a0, a0, 8                 # data++
+        bgt      a1, a2, 1b                # while (len > lag);
+2:
+        vfmv.f.s ft0, v0                   # ft0 = data[i]
+        vsetvli  zero, a1, e64, m8, tu, ma # VL = len; keep the tail of v16
+        vfmacc.vf v16, ft0, v0             # v16[j] += data[i] * data[i + j]
+        addi     a1, a1, -1                # len--
+        vslide1down.vx v0, v0, zero        # slide window down, append 0
+        bnez     a1, 2b                    # while (len > 0);
+
+        vsetvli  zero, a2, e64, m8, ta, ma # VL = lag again for the store
+        vse64.v  v16, (a3)                 # store autoc[0..lag - 1]
+        ret
+endfunc
 
 #endif
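
Not part of the commit, but a quick harness in the spirit of the
pseudocode can sanity-check the routine on RVV hardware; main() and
the test values are made up, and the prototype is the one declared in
lpc_init.c above:

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    void ff_lpc_compute_autocorr_rvv(const double *, ptrdiff_t, int, double *);
    /* autocorr_naive() is the scalar sketch shown earlier in this message. */
    void autocorr_naive(const double *, ptrdiff_t, int, double *);

    int main(void)
    {
        double data[64], ref[8], rvv[8];

        for (int i = 0; i < 64; i++)
            data[i] = sin(i * 0.1);

        autocorr_naive(data, 64, 8, ref);
        ff_lpc_compute_autocorr_rvv(data, 64, 8, rvv);

        for (int j = 0; j < 8; j++)
            printf("%d: ref=%f rvv=%f\n", j, ref[j], rvv[j]);
        return 0;
    }

Build it together with the earlier sketch and the assembled lpc_rvv.S
on a target that implements the Zve64d extension.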