lavc/mpegvideoencdsp: R-V V try_8x8basis

T-Head C908:
try_8x8basis_c:       922.5
try_8x8basis_rvv_i32: 135.3

SpacemiT X60:
try_8x8basis_c:       926.1
try_8x8basis_rvv_i32: 103.1
This commit is contained in:
Rémi Denis-Courmont 2024-08-14 19:56:05 +03:00
parent 0fd37c00d7
commit 1907dd7f23
2 changed files with 46 additions and 4 deletions

View File

@ -23,6 +23,8 @@
#include "libavutil/cpu.h" #include "libavutil/cpu.h"
#include "libavcodec/mpegvideoencdsp.h" #include "libavcodec/mpegvideoencdsp.h"
/**
 * RVV implementation of try_8x8basis (written in assembly).
 *
 * All three arrays are read as 64 consecutive 16-bit coefficients.
 *
 * @param rem    64 coefficients added to the rescaled basis
 * @param weight 64 per-coefficient weights
 * @param basis  64 basis coefficients, each multiplied by @p scale
 * @param scale  scalar multiplier applied to every basis coefficient
 * @return the accumulated weighted squared sum, shifted right by 2
 */
int ff_try_8x8basis_rvv(const int16_t rem[64], const int16_t weight[64],
                        const int16_t basis[64], int scale);
/* RVV kernels that the init function assigns to c->pix_sum and c->pix_norm1.
 * NOTE(review): presumably both are defined in the accompanying .S file — confirm. */
int ff_pix_sum_rvv(const uint8_t *pix, int line_size); int ff_pix_sum_rvv(const uint8_t *pix, int line_size);
int ff_pix_norm1_rvv(const uint8_t *pix, int line_size); int ff_pix_norm1_rvv(const uint8_t *pix, int line_size);
@ -32,10 +34,15 @@ av_cold void ff_mpegvideoencdsp_init_riscv(MpegvideoEncDSPContext *c,
#if HAVE_RVV #if HAVE_RVV
int flags = av_get_cpu_flags(); int flags = av_get_cpu_flags();
if (flags & AV_CPU_FLAG_RVV_I32) {
if (flags & AV_CPU_FLAG_RVB)
c->try_8x8basis = ff_try_8x8basis_rvv;
if (flags & AV_CPU_FLAG_RVV_I64) { if (flags & AV_CPU_FLAG_RVV_I64) {
if ((flags & AV_CPU_FLAG_RVB) && ff_rv_vlen_least(128)) if ((flags & AV_CPU_FLAG_RVB) && ff_rv_vlen_least(128))
c->pix_sum = ff_pix_sum_rvv; c->pix_sum = ff_pix_sum_rvv;
c->pix_norm1 = ff_pix_norm1_rvv; c->pix_norm1 = ff_pix_norm1_rvv;
} }
}
#endif #endif
} }

View File

@ -20,6 +20,41 @@
#include "libavutil/riscv/asm.S" #include "libavutil/riscv/asm.S"
/* Fixed-point shift amounts used by ff_try_8x8basis_rvv:
 * the scaled basis is narrowed by (BASIS_SHIFT - RECON_SHIFT) bits, and the
 * sum with rem is then shifted right by RECON_SHIFT bits.
 * NOTE(review): presumably these mirror same-named constants on the C side —
 * confirm they are kept in sync. */
.equ BASIS_SHIFT, 16
.equ RECON_SHIFT, 6
/*
 * ff_try_8x8basis_rvv(rem, weight, basis, scale)
 *
 *   a0 = rem     (64 x int16)
 *   a1 = weight  (64 x int16)
 *   a2 = basis   (64 x int16)
 *   a3 = scale   (scalar)
 *
 * For each of the 64 coefficients:
 *     t    = rounded (basis[i] * scale) >> (BASIS_SHIFT - RECON_SHIFT),
 *            narrowed to 16 bits with saturation
 *     b    = (rem[i] + t) >> RECON_SHIFT
 *     acc += ((weight[i] * b) * (weight[i] * b)) >> 4
 * Returns the sum of all accumulator lanes, shifted right by 2.
 *
 * Clobbers t0, t1, v1, v4-v31 and the vxrm CSR.
 */
func ff_try_8x8basis_rvv, zve32x, b
        lpad    0                      // landing pad, as in ff_pix_sum_rvv: reached via function pointer
        li      t1, 64                 // coefficients left to process
        csrwi   vxrm, 0                // fixed-point rounding mode 0 (round-to-nearest-up) for vnclip
        vsetvli t0, t1, e32, m8, ta, ma
        vmv.v.x v24, zero              // v24-v31: 32-bit per-lane accumulators
        vmv.s.x v1, zero               // seed element for the final reduction
1:
        // Same SEW/LMUL ratio as e32/m8, so vl (= t0) is unchanged.
        // NOTE(review): t0 is never recomputed; the loop relies on it dividing 64.
        vsetvli zero, zero, e16, m4, ta, ma
        vle16.v v4, (a2)               // basis
        sub     t1, t1, t0
        vwmul.vx    v16, v4, a3        // 32-bit basis * scale
        sh1add  a2, t0, a2             // a2 += 2 * t0 bytes
        vle16.v v8, (a0)               // rem
        sh1add  a0, t0, a0
        vnclip.wi   v4, v16, BASIS_SHIFT - RECON_SHIFT  // rounding, saturating narrow to 16 bits
        vle16.v v12, (a1)              // weight
        sh1add  a1, t0, a1
        vadd.vv v4, v8, v4             // rem + rescaled basis (16-bit, wrapping)
        vsra.vi v4, v4, RECON_SHIFT    // b
        vwmul.vv    v16, v12, v4       // 32-bit weight * b
        vsetvli zero, zero, e32, m8, ta, ma
        vmul.vv v16, v16, v16          // (weight * b)^2, low 32 bits
        vsra.vi v16, v16, 4
        vadd.vv v24, v24, v16          // accumulate per lane
        bnez    t1, 1b
        vredsum.vs  v1, v24, v1        // horizontal sum into v1[0]
        vmv.x.s a0, v1
        // NOTE(review): arithmetic shift — a 32-bit sum with its top bit set
        // yields a negative result; confirm this matches the scalar reference.
        srai    a0, a0, 2
        ret
endfunc
func ff_pix_sum_rvv, zve64x, b func ff_pix_sum_rvv, zve64x, b
lpad 0 lpad 0
vsetivli t0, 16, e16, m1, ta, ma vsetivli t0, 16, e16, m1, ta, ma