lavc/h264dsp: R-V V 8-bit h264_biweight_pixels

T-Head C908:
h264_biweight2_8_c:        58.0
h264_biweight2_8_rvv_i32:  11.2
h264_biweight4_8_c:       106.0
h264_biweight4_8_rvv_i32:  22.7
h264_biweight8_8_c:       205.7
h264_biweight8_8_rvv_i32:  50.0
h264_biweight16_8_c:      403.5
h264_biweight16_8_rvv_i32: 83.2

SpacemiT X60:
h264_weight2_8_c:          48.2
h264_weight2_8_rvv_i32:     8.2
h264_weight4_8_c:          90.5
h264_weight4_8_rvv_i32:    16.5
h264_weight8_8_c:         175.2
h264_weight8_8_rvv_i32:    38.0
h264_weight16_8_c:        342.2
h264_weight16_8_rvv_i32:   66.0
This commit is contained in:
Rémi Denis-Courmont 2024-07-05 23:03:28 +03:00
parent 3606e592ea
commit f1ed351d3b
2 changed files with 98 additions and 4 deletions

View File

@ -28,7 +28,10 @@
#include "libavutil/riscv/cpu.h"
#include "libavcodec/h264dsp.h"
extern const h264_weight_func ff_h264_weight_funcs_8_rvv[];
/*
 * Table of 8-bit RVV weighted-prediction entry points, defined in assembly.
 * Each element pairs the weight function with the biweight function for one
 * block width; the assembly emits the elements for widths 16, 8, 4, 2 in
 * that order, each as two consecutive pointers (weight first).
 */
extern const struct {
const h264_weight_func weight;
const h264_biweight_func biweight;
} ff_h264_weight_funcs_8_rvv[];
void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
@ -63,9 +66,12 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32) {
if (bit_depth == 8 && ff_rv_vlen_least(128)) {
memcpy(dsp->weight_h264_pixels_tab,
ff_h264_weight_funcs_8_rvv,
sizeof (dsp->weight_h264_pixels_tab));
for (int i = 0; i < 4; i++) {
dsp->weight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].weight;
dsp->biweight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].biweight;
}
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;

View File

@ -48,6 +48,35 @@ func ff_h264_weight_pixels_simple_8_rvv, zve32x
ret
endfunc
/*
 * Bi-directional weighted blend of two 8-bit pixel blocks, one whole row per
 * loop iteration.
 *
 * Reached by a tail jump from ff_h264_biweight_pixels16_8_rvv, which loads
 * the row width into t6. Because t6 is an input outside the standard
 * calling convention, the symbol is marked .variant_cc.
 *
 * Register usage, as read from the code below:
 *   a0  rows that are loaded, blended and overwritten in place
 *   a1  second input rows (only read)
 *   a2  byte step between consecutive rows, applied to both a0 and a1
 *   a3  number of rows; must be non-zero on entry (tested only after the
 *       decrement)
 *   a4  shift amount of the first narrowing step
 *   a5  signed multiplier applied to the a0 pixels
 *   a6  signed multiplier applied to the a1 pixels
 *   a7  additive term, turned into ((a7 + 1) | 1) << a4 before the loop
 *   t6  number of pixels per row, used as the vector length
 * NOTE(review): a0..a7 presumably carry the h264_biweight_func arguments
 * (dst, src, stride, height, log2_denom, weightd, weights, offset) in that
 * order -- confirm against the prototype in libavcodec/h264dsp.h.
 *
 * Per pixel: (a0[x] * a5 + a1[x] * a6 + a7') is shifted right by a4,
 * saturated to signed 16 bits, clamped at 0, shifted right by 1 more bit
 * and saturated to unsigned 8 bits.
 *
 * Modifies a0, a1, a3, a7, vxrm, vl/vtype, v8, v12, v16-v19, v24-v25 and
 * v28-v29.
 */
.variant_cc ff_h264_biweight_pixels_simple_8_rvv
func ff_h264_biweight_pixels_simple_8_rvv, zve32x
/* vxrm = 2 (round-down): the narrowing shifts below truncate. */
csrwi vxrm, 2
/* a7 = ((a7 + 1) | 1) << a4 */
addi a7, a7, 1
ori a7, a7, 1
sll a7, a7, a4
1:
/*
 * vl = t6 with a 32-bit/LMUL=4 vtype so that the accumulator group can be
 * initialised; the 8-bit loads below then use a single register each.
 * NOTE(review): this only covers a full row if VLMAX for e32/m4 is at
 * least t6 -- presumably guaranteed by the ff_rv_vlen_least(128) check in
 * the C init code; confirm.
 */
vsetvli zero, t6, e32, m4, ta, ma
vle8.v v8, (a0)
addi a3, a3, -1
vle8.v v12, (a1)
add a1, a1, a2
/* Seed every 32-bit accumulator with the pre-shifted additive term. */
vmv.v.x v16, a7
vsetvli zero, zero, e16, m2, ta, ma
/* Zero-extend both inputs from 8 to 16 bits. */
vzext.vf2 v24, v8
vzext.vf2 v28, v12
/* Widening accumulate: v16 += a5 * v24 and v16 += a6 * v28, with the
 * scalar treated as signed and the vector elements as unsigned. */
vwmaccsu.vx v16, a5, v24
vwmaccsu.vx v16, a6, v28
/* 32 -> 16 bits: shift right by a4 with signed saturation. */
vnclip.wx v16, v16, a4
/* Clamp negative results to zero before the unsigned narrowing. */
vmax.vx v16, v16, zero
vsetvli zero, zero, e8, m1, ta, ma
/* 16 -> 8 bits: shift right by one more bit with unsigned saturation. */
vnclipu.wi v8, v16, 1
vse8.v v8, (a0)
add a0, a0, a2
bnez a3, 1b
ret
endfunc
func ff_h264_weight_pixels_8_rvv, zve32x
csrwi vxrm, 0
sll a5, a5, a3
@ -84,6 +113,53 @@ func ff_h264_weight_pixels_8_rvv, zve32x
ret
endfunc
/*
 * Bi-directional weighted blend of two 8-bit pixel blocks, vectorised down
 * the rows: each vector element is one row, and the inner loop walks the
 * block two columns at a time.
 *
 * Reached by a tail jump from the ff_h264_biweight_pixels{8,4,2}_8_rvv
 * wrappers, which load the block width into t6. Because t6 is an input
 * outside the standard calling convention, the symbol is marked
 * .variant_cc.
 *
 * Register usage, as read from the code below:
 *   a0  rows that are loaded, blended and overwritten in place
 *   a1  second input rows (only read)
 *   a2  byte step between consecutive rows (stride of the segment accesses)
 *   a3  number of rows; must be non-zero on entry
 *   a4  shift amount of the first narrowing step
 *   a5  signed multiplier applied to the a0 pixels
 *   a6  signed multiplier applied to the a1 pixels
 *   a7  additive term, turned into ((a7 + 1) | 1) << a4 before the loops
 *   t6  block width in pixels; must be even and non-zero, because the
 *       column counter is decremented by 2 and compared against zero
 * NOTE(review): a0..a7 presumably carry the h264_biweight_func arguments
 * (dst, src, stride, height, log2_denom, weightd, weights, offset) in that
 * order -- confirm against the prototype in libavcodec/h264dsp.h.
 *
 * Per pixel: (a0[x] * a5 + a1[x] * a6 + a7') is shifted right by a4,
 * saturated to signed 16 bits, clamped at 0, shifted right by 1 more bit
 * and saturated to unsigned 8 bits.
 *
 * Modifies a0, a1, a3, a7, t0-t3, t5, vxrm, vl/vtype and v0-v31.
 */
.variant_cc ff_h264_biweight_pixels_8_rvv
func ff_h264_biweight_pixels_8_rvv, zve32x
/* vxrm = 2 (round-down): the narrowing shifts below truncate. */
csrwi vxrm, 2
/* a7 = ((a7 + 1) | 1) << a4 */
addi a7, a7, 1
ori a7, a7, 1
sll a7, a7, a4
1:
/* Outer loop: start at the first column of the next batch of rows. */
mv t0, a0
mv t1, a1
mv t5, t6
2:
/* t2 = vl = number of rows handled in this batch (at most a3). */
vsetvli t2, a3, e32, m8, ta, ma
/*
 * Strided two-field segment loads: element i takes the two adjacent bytes
 * at base + i * a2. First bytes land in the group at v0 (resp. v4),
 * second bytes in the group at v2 (resp. v6).
 */
vlsseg2e8.v v0, (t0), a2
vlsseg2e8.v v4, (t1), a2
addi t5, t5, -2
/* One 32-bit accumulator group per column, seeded with the additive term. */
vmv.v.x v16, a7
vmv.v.x v24, a7
vsetvli zero, zero, e16, m4, ta, ma
/* Columns loaded through t0: zero-extend to 16 bits, accumulate a5 * pixel. */
vzext.vf2 v8, v0
vzext.vf2 v12, v2
vwmaccsu.vx v16, a5, v8
vwmaccsu.vx v24, a5, v12
/* Columns loaded through t1: v8/v12 are reused as scratch, accumulate
 * a6 * pixel. */
vzext.vf2 v8, v4
vzext.vf2 v12, v6
vwmaccsu.vx v16, a6, v8
vwmaccsu.vx v24, a6, v12
/* 32 -> 16 bits: shift right by a4 with signed saturation. */
vnclip.wx v8, v16, a4
vnclip.wx v12, v24, a4
/* Clamp negative results to zero before the unsigned narrowing. */
vmax.vx v8, v8, zero
vmax.vx v12, v12, zero
vsetvli zero, zero, e8, m2, ta, ma
/* 16 -> 8 bits: shift right by one more bit with unsigned saturation. */
vnclipu.wi v0, v8, 1
vnclipu.wi v2, v12, 1
/* Write both result columns back over the rows addressed through t0. */
vssseg2e8.v v0, (t0), a2
/* Step to the next pair of columns. */
addi t0, t0, 2
addi t1, t1, 2
bnez t5, 2b
/* Advance both base pointers past the t2 rows just processed. */
mul t3, a2, t2
sub a3, a3, t2
add a0, a0, t3
add a1, a1, t3
bnez a3, 1b
ret
endfunc
.irp w, 16, 8, 4, 2
func ff_h264_weight_pixels\w\()_8_rvv, zve32x
li a6, \w
@ -93,6 +169,15 @@ func ff_h264_weight_pixels\w\()_8_rvv, zve32x
j ff_h264_weight_pixels_8_rvv
.endif
endfunc
/*
 * Entry point for block width \w: puts the width in t6 and tail-jumps to
 * the shared implementation, leaving a0-a7 untouched. Width 16 goes to the
 * row-at-a-time variant, every other width to the column-pair variant.
 */
func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
li t6, \w
.if \w == 16
j ff_h264_biweight_pixels_simple_8_rvv
.else
j ff_h264_biweight_pixels_8_rvv
.endif
endfunc
.endr
.global ff_h264_weight_funcs_8_rvv
@ -101,10 +186,13 @@ const ff_h264_weight_funcs_8_rvv
.irp w, 16, 8, 4, 2
#if __riscv_xlen == 32
.word ff_h264_weight_pixels\w\()_8_rvv
.word ff_h264_biweight_pixels\w\()_8_rvv
#elif __riscv_xlen == 64
.dword ff_h264_weight_pixels\w\()_8_rvv
.dword ff_h264_biweight_pixels\w\()_8_rvv
#else
.qword ff_h264_weight_pixels\w\()_8_rvv
.qword ff_h264_biweight_pixels\w\()_8_rvv
#endif
.endr
endconst