lavc/h264dsp: R-V V 8-bit h264_weight_pixels

There are two implementations here:
- a generic scalable one processing two columns at a time,
- a specialised one processing one (fixed-size) row at a time.

Unsurprisingly, the generic one works out better with smaller widths.
With larger widths, the gains from filling vectors are outweighed by
the extra cost of strided loads and stores. In other words, memory
accesses become the bottleneck.

T-Head C908:
h264_weight2_8_c:        54.5
h264_weight2_8_rvv_i32:  13.7
h264_weight4_8_c:       101.7
h264_weight4_8_rvv_i32:  27.5
h264_weight8_8_c:       197.0
h264_weight8_8_rvv_i32:  75.5
h264_weight16_8_c:      385.0
h264_weight16_8_rvv_i32: 74.2

SpacemiT X60:
h264_weight2_8_c:        48.5
h264_weight2_8_rvv_i32:   8.2
h264_weight4_8_c:        90.7
h264_weight4_8_rvv_i32:  16.5
h264_weight8_8_c:       175.0
h264_weight8_8_rvv_i32:  37.7
h264_weight16_8_c:      342.2
h264_weight16_8_rvv_i32: 66.0
This commit is contained in:
Rémi Denis-Courmont 2024-07-04 21:38:48 +03:00
parent 85706f5136
commit 3606e592ea
2 changed files with 90 additions and 0 deletions

View File

@ -21,12 +21,15 @@
#include "config.h"
#include <stdint.h>
#include <string.h>
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/riscv/cpu.h"
#include "libavcodec/h264dsp.h"
extern const h264_weight_func ff_h264_weight_funcs_8_rvv[];
void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@ -60,6 +63,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32) {
if (bit_depth == 8 && ff_rv_vlen_least(128)) {
memcpy(dsp->weight_h264_pixels_tab,
ff_h264_weight_funcs_8_rvv,
sizeof (dsp->weight_h264_pixels_tab));
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff =

View File

@ -26,6 +26,89 @@
#include "libavutil/riscv/asm.S"
# Weighted-prediction kernel handling one whole row of 8-bit samples per
# iteration. Reached by a jump from ff_h264_weight_pixels16_8_rvv, which
# sets a6 = 16 first.
# Per sample: out = clamp_u8((sample * a4 + (a5 << a3)) >> a3), where the
# right shift rounds according to vxrm.
# In:
#   a0 = address of the first row
#   a1 = byte distance added to a0 after each row
#   a2 = row count; decremented before it is tested, so zero is not handled
#   a3 = right-shift amount
#   a4 = multiplier, treated as signed
#   a5 = offset, shifted left by a3 before use
#   a6 = number of samples per row (becomes vl)
# NOTE(review): a0-a5 presumably carry the h264_weight_func arguments
# (block, stride, height, log2_denom, weight, offset) - confirm in h264dsp.h.
# Clobbers: a0, a2, a5, v8, v16-v19, v24-v25, vxrm, vl, vtype.
func ff_h264_weight_pixels_simple_8_rvv, zve32x
# Rounding mode 0 (round-to-nearest-up) for the narrowing shift below.
csrwi vxrm, 0
# Pre-scale the offset so it can be accumulated ahead of the right shift.
sll a5, a5, a3
1:
# Set vl = a6 under the 32-bit accumulator layout; the later vsetvli forms
# keep the same SEW/LMUL ratio and therefore the same vl.
vsetvli zero, a6, e32, m4, ta, ma
vle8.v v8, (a0)
addi a2, a2, -1
# Seed every 32-bit accumulator with the scaled offset.
vmv.v.x v16, a5
vsetvli zero, zero, e16, m2, ta, ma
# Zero-extend the samples to 16 bits.
vzext.vf2 v24, v8
# Accumulate a4 (signed) times sample (unsigned), widening to 32 bits.
vwmaccsu.vx v16, a4, v24
# Shift right by a3 with rounding, saturating to signed 16 bits.
vnclip.wx v16, v16, a3
# Clamp negative results to zero so the unsigned narrowing below is valid.
vmax.vx v16, v16, zero
vsetvli zero, zero, e8, m1, ta, ma
# Saturate to unsigned 8 bits without any further shift.
vnclipu.wi v8, v16, 0
vse8.v v8, (a0)
add a0, a0, a1
bnez a2, 1b
ret
endfunc
# Weighted-prediction kernel for 8-bit samples that vectorises down the rows.
# Each inner pass handles two adjacent byte columns across up to vl rows,
# using strided two-field segment loads and stores.
# Reached by a jump from the width 8, 4 and 2 entry points, which set a6.
# Per sample: out = clamp_u8((sample * a4 + (a5 << a3)) >> a3), where the
# right shift rounds according to vxrm.
# In:
#   a0 = address of the first row
#   a1 = byte stride between rows
#   a2 = row count; must be non-zero
#   a3 = right-shift amount
#   a4 = multiplier, treated as signed
#   a5 = offset, shifted left by a3 before use
#   a6 = row width in bytes; stepped down by 2, so it must be even and non-zero
# NOTE(review): a0-a5 presumably carry the h264_weight_func arguments
# (block, stride, height, log2_denom, weight, offset) - confirm in h264dsp.h.
# Clobbers: a0, a2, a5, t0, t2, t3, t6, v0-v3, v8-v31, vxrm, vl, vtype.
func ff_h264_weight_pixels_8_rvv, zve32x
# Rounding mode 0 (round-to-nearest-up) for the narrowing shifts below.
csrwi vxrm, 0
# Pre-scale the offset so it can be accumulated ahead of the right shift.
sll a5, a5, a3
1:
# Outer loop, one batch of rows: t0 = column cursor, t6 = bytes left per row.
mv t0, a0
mv t6, a6
2:
# Set t2 = vl = rows in this batch (at most a2). Re-issued on every pass
# because the body finishes with e8/m2 selected.
vsetvli t2, a2, e32, m8, ta, ma
# Load two columns, one element per row: v0 = byte at t0, v2 = byte at t0 + 1.
vlsseg2e8.v v0, (t0), a1
addi t6, t6, -2
# Seed both 32-bit accumulator groups with the scaled offset.
vmv.v.x v16, a5
vmv.v.x v24, a5
vsetvli zero, zero, e16, m4, ta, ma
# Zero-extend both columns to 16 bits.
vzext.vf2 v8, v0
vzext.vf2 v12, v2
# Accumulate a4 (signed) times sample (unsigned), widening to 32 bits.
vwmaccsu.vx v16, a4, v8
vwmaccsu.vx v24, a4, v12
# Shift right by a3 with rounding, saturating to signed 16 bits.
vnclip.wx v8, v16, a3
vnclip.wx v12, v24, a3
# Clamp negative results to zero so the unsigned narrowing below is valid.
vmax.vx v8, v8, zero
vmax.vx v12, v12, zero
vsetvli zero, zero, e8, m2, ta, ma
# Saturate to unsigned 8 bits without any further shift.
vnclipu.wi v0, v8, 0
vnclipu.wi v2, v12, 0
# Store both columns back to the addresses they were loaded from.
vssseg2e8.v v0, (t0), a1
# Move on to the next pair of columns.
addi t0, t0, 2
bnez t6, 2b
# Batch done: advance a0 by vl rows and repeat while rows remain.
mul t3, a1, t2
sub a2, a2, t2
add a0, a0, t3
bnez a2, 1b
ret
endfunc
# Per-width entry points listed in ff_h264_weight_funcs_8_rvv. Each one
# records its block width in a6 and jumps to a shared kernel: widths below
# 16 use the column-pair kernel, width 16 uses the row-at-a-time kernel.
.irp w, 16, 8, 4, 2
func ff_h264_weight_pixels\w\()_8_rvv, zve32x
        li      a6, \w
        .if     \w != 16
        j       ff_h264_weight_pixels_8_rvv
        .else
        j       ff_h264_weight_pixels_simple_8_rvv
        .endif
endfunc
.endr
# Table of the four entry points in width order 16, 8, 4, 2. Every entry is
# one pointer-sized value, so the data directive follows __riscv_xlen.
        .global ff_h264_weight_funcs_8_rvv
        .hidden ff_h264_weight_funcs_8_rvv
const ff_h264_weight_funcs_8_rvv
#if __riscv_xlen == 32
        .irp    w, 16, 8, 4, 2
        .word   ff_h264_weight_pixels\w\()_8_rvv
        .endr
#elif __riscv_xlen == 64
        .irp    w, 16, 8, 4, 2
        .dword  ff_h264_weight_pixels\w\()_8_rvv
        .endr
#else
        .irp    w, 16, 8, 4, 2
        .qword  ff_h264_weight_pixels\w\()_8_rvv
        .endr
#endif
endconst
.variant_cc ff_h264_loop_filter_luma_8_rvv
func ff_h264_loop_filter_luma_8_rvv, zve32x
# p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13