lavc/h264dsp: optimise R-V V biweight for shorter heights

T-Head C908:
h264_biweight2_8_c:                                    313.7 ( 1.00x)
h264_biweight2_8_rvv_i32:              before          239.5 ( 1.23x)
h264_biweight2_8_rvv_i32:              after            72.7 ( 4.31x)
h264_biweight4_8_c:                                    582.0 ( 1.00x)
h264_biweight4_8_rvv_i32:              before          471.0 ( 1.16x)
h264_biweight4_8_rvv_i32:              after            91.5 ( 6.36x)
h264_biweight8_8_c:                                   1110.0 ( 1.00x)
h264_biweight8_8_rvv_i32:              before          943.3 ( 1.10x)
h264_biweight8_8_rvv_i64:              after           147.0 ( 7.55x)

SpacemiT X60:
h264_biweight2_8_c:                                    311.4 ( 1.00x)
h264_biweight2_8_rvv_i32:              before          363.1 ( 0.83x)
h264_biweight2_8_rvv_i32:              after           103.1 ( 3.02x)
h264_biweight4_8_c:                                    571.9 ( 1.00x)
h264_biweight4_8_rvv_i32:              before          717.4 ( 0.78x)
h264_biweight4_8_rvv_i32:              after            71.8 ( 7.96x)
h264_biweight8_8_c:                                   1103.1 ( 1.00x)
h264_biweight8_8_rvv_i32:              before         1415.2 ( 0.76x)
h264_biweight8_8_rvv_i64:              after            92.8 (11.88x)
This commit is contained in:
Rémi Denis-Courmont 2024-09-01 18:58:27 +03:00
parent 459a1512f1
commit 6611bf5484
2 changed files with 40 additions and 52 deletions

View File

@ -97,23 +97,30 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
const bool zvl128b = ff_rv_vlen_least(128); const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8) { if (bit_depth == 8) {
if (zvl128b && (flags & AV_CPU_FLAG_RVB)) if (zvl128b) {
dsp->weight_h264_pixels_tab[0] = if (flags & AV_CPU_FLAG_RVB)
ff_h264_weight_funcs_8_rvv[0].weight; dsp->weight_h264_pixels_tab[0] =
if (flags & AV_CPU_FLAG_RVV_I64) ff_h264_weight_funcs_8_rvv[0].weight;
dsp->biweight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].biweight;
}
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->weight_h264_pixels_tab[1] = dsp->weight_h264_pixels_tab[1] =
ff_h264_weight_funcs_8_rvv[1].weight; ff_h264_weight_funcs_8_rvv[1].weight;
dsp->biweight_h264_pixels_tab[1] =
ff_h264_weight_funcs_8_rvv[1].biweight;
}
dsp->weight_h264_pixels_tab[2] = dsp->weight_h264_pixels_tab[2] =
ff_h264_weight_funcs_8_rvv[2].weight; ff_h264_weight_funcs_8_rvv[2].weight;
dsp->biweight_h264_pixels_tab[2] =
ff_h264_weight_funcs_8_rvv[2].biweight;
dsp->weight_h264_pixels_tab[3] = dsp->weight_h264_pixels_tab[3] =
ff_h264_weight_funcs_8_rvv[3].weight; ff_h264_weight_funcs_8_rvv[3].weight;
dsp->biweight_h264_pixels_tab[3] =
ff_h264_weight_funcs_8_rvv[3].biweight;
} }
if (bit_depth == 8 && zvl128b) { if (bit_depth == 8 && zvl128b) {
for (int i = 0; i < 4; i++)
dsp->biweight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].biweight;
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv; dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv; dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff = dsp->h264_h_loop_filter_luma_mbaff =

View File

@ -115,68 +115,49 @@ func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x
ret ret
.endif .endif
endfunc endfunc
.endm
.variant_cc ff_h264_biweight_pixels_8_rvv func ff_h264_biweight_pixels\w\()_\depth\()_rvv, zve64x
func ff_h264_biweight_pixels_8_rvv, zve32x lpad 0
li t6, \w
.ifb \b
j ff_h264_biweight_pixels_simple_8_rvv
.else
csrwi vxrm, 2 csrwi vxrm, 2
addi a7, a7, 1 addi a7, a7, 1
ori a7, a7, 1 ori a7, a7, 1
sll a7, a7, a4 sll a7, a7, a4
addi a4, a4, 1 addi a4, a4, 1
1: 1:
mv t0, a0 vsetvli t1, a3, e\b, m2, ta, ma
mv t1, a1 vlse\b\().v v8, (a0), a2
mv t5, t6 sub a3, a3, t1
2: vlse\b\().v v12, (a1), a2
vsetvli t2, a3, e16, m8, ta, ma mul t2, t1, a2
vlsseg2e8.v v0, (t0), a2 vsetvli t0, zero, e16, m4, ta, ma
vlsseg2e8.v v8, (t1), a2 vmv.v.x v16, a7
addi t5, t5, -2 vsetvli zero, zero, e8, m2, ta, ma
vmv.v.x v16, a7 vwmaccsu.vx v16, a5, v8
vmv.v.x v24, a7 add a1, a1, t2
vsetvli zero, zero, e8, m4, ta, ma vwmaccsu.vx v16, a6, v12
vwmaccsu.vx v16, a5, v0 vsetvli zero, zero, e16, m4, ta, ma
vwmaccsu.vx v24, a5, v4
vwmaccsu.vx v16, a6, v8
vwmaccsu.vx v24, a6, v12
vsetvli zero, zero, e16, m8, ta, ma
vmax.vx v16, v16, zero vmax.vx v16, v16, zero
vmax.vx v24, v24, zero vsetvli zero, zero, e8, m2, ta, ma
vsetvli zero, zero, e8, m4, ta, ma vnclipu.wx v8, v16, a4
vnclipu.wx v0, v16, a4 vsetvli zero, t1, e\b, m2, ta, ma
vnclipu.wx v4, v24, a4 vsse\b\().v v8, (a0), a2
vssseg2e8.v v0, (t0), a2 add a0, a0, t2
addi t0, t0, 2 .endif
addi t1, t1, 2
bnez t5, 2b
mul t3, a2, t2
sub a3, a3, t2
add a0, a0, t3
add a1, a1, t3
bnez a3, 1b bnez a3, 1b
ret ret
endfunc endfunc
.endm
h264_weight 8, 2, 16 h264_weight 8, 2, 16
h264_weight 8, 4, 32 h264_weight 8, 4, 32
h264_weight 8, 8, 64 h264_weight 8, 8, 64
h264_weight 8, 16 h264_weight 8, 16
.irp w, 16, 8, 4, 2
func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
lpad 0
li t6, \w
.if \w == 16
j ff_h264_biweight_pixels_simple_8_rvv
.else
j ff_h264_biweight_pixels_8_rvv
.endif
endfunc
.endr
.global ff_h264_weight_funcs_8_rvv .global ff_h264_weight_funcs_8_rvv
.hidden ff_h264_weight_funcs_8_rvv .hidden ff_h264_weight_funcs_8_rvv
const ff_h264_weight_funcs_8_rvv const ff_h264_weight_funcs_8_rvv