lavc/h264dsp: optimise R-V V biweight for shorter heights

T-Head C908:
h264_biweight2_8_c:                                    313.7 ( 1.00x)
h264_biweight2_8_rvv_i32:              before          239.5 ( 1.23x)
h264_biweight2_8_rvv_i32:              after            72.7 ( 4.31x)
h264_biweight4_8_c:                                    582.0 ( 1.00x)
h264_biweight4_8_rvv_i32:              before          471.0 ( 1.16x)
h264_biweight4_8_rvv_i32:              after            91.5 ( 6.36x)
h264_biweight8_8_c:                                   1110.0 ( 1.00x)
h264_biweight8_8_rvv_i32:              before          943.3 ( 1.10x)
h264_biweight8_8_rvv_i64:              after           147.0 ( 7.55x)

SpacemiT X60:
h264_biweight2_8_c:                                    311.4 ( 1.00x)
h264_biweight2_8_rvv_i32:              before          363.1 ( 0.83x)
h264_biweight2_8_rvv_i32:              after           103.1 ( 3.02x)
h264_biweight4_8_c:                                    571.9 ( 1.00x)
h264_biweight4_8_rvv_i32:              before          717.4 ( 0.78x)
h264_biweight4_8_rvv_i32:              after            71.8 ( 7.96x)
h264_biweight8_8_c:                                   1103.1 ( 1.00x)
h264_biweight8_8_rvv_i32:              before         1415.2 ( 0.76x)
h264_biweight8_8_rvv_i64:              after            92.8 (11.88x)
This commit is contained in:
Rémi Denis-Courmont 2024-09-01 18:58:27 +03:00
parent 459a1512f1
commit 6611bf5484
2 changed files with 40 additions and 52 deletions

View File

@ -97,23 +97,30 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8) {
if (zvl128b && (flags & AV_CPU_FLAG_RVB))
dsp->weight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].weight;
if (flags & AV_CPU_FLAG_RVV_I64)
if (zvl128b) {
if (flags & AV_CPU_FLAG_RVB)
dsp->weight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].weight;
dsp->biweight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].biweight;
}
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->weight_h264_pixels_tab[1] =
ff_h264_weight_funcs_8_rvv[1].weight;
dsp->biweight_h264_pixels_tab[1] =
ff_h264_weight_funcs_8_rvv[1].biweight;
}
dsp->weight_h264_pixels_tab[2] =
ff_h264_weight_funcs_8_rvv[2].weight;
dsp->biweight_h264_pixels_tab[2] =
ff_h264_weight_funcs_8_rvv[2].biweight;
dsp->weight_h264_pixels_tab[3] =
ff_h264_weight_funcs_8_rvv[3].weight;
dsp->biweight_h264_pixels_tab[3] =
ff_h264_weight_funcs_8_rvv[3].biweight;
}
if (bit_depth == 8 && zvl128b) {
for (int i = 0; i < 4; i++)
dsp->biweight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].biweight;
dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff =

View File

@ -115,68 +115,49 @@ func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x
ret
.endif
endfunc
.endm
.variant_cc ff_h264_biweight_pixels_8_rvv
func ff_h264_biweight_pixels_8_rvv, zve32x
func ff_h264_biweight_pixels\w\()_\depth\()_rvv, zve64x
lpad 0
li t6, \w
.ifb \b
j ff_h264_biweight_pixels_simple_8_rvv
.else
csrwi vxrm, 2
addi a7, a7, 1
ori a7, a7, 1
sll a7, a7, a4
addi a4, a4, 1
1:
mv t0, a0
mv t1, a1
mv t5, t6
2:
vsetvli t2, a3, e16, m8, ta, ma
vlsseg2e8.v v0, (t0), a2
vlsseg2e8.v v8, (t1), a2
addi t5, t5, -2
vmv.v.x v16, a7
vmv.v.x v24, a7
vsetvli zero, zero, e8, m4, ta, ma
vwmaccsu.vx v16, a5, v0
vwmaccsu.vx v24, a5, v4
vwmaccsu.vx v16, a6, v8
vwmaccsu.vx v24, a6, v12
vsetvli zero, zero, e16, m8, ta, ma
vsetvli t1, a3, e\b, m2, ta, ma
vlse\b\().v v8, (a0), a2
sub a3, a3, t1
vlse\b\().v v12, (a1), a2
mul t2, t1, a2
vsetvli t0, zero, e16, m4, ta, ma
vmv.v.x v16, a7
vsetvli zero, zero, e8, m2, ta, ma
vwmaccsu.vx v16, a5, v8
add a1, a1, t2
vwmaccsu.vx v16, a6, v12
vsetvli zero, zero, e16, m4, ta, ma
vmax.vx v16, v16, zero
vmax.vx v24, v24, zero
vsetvli zero, zero, e8, m4, ta, ma
vnclipu.wx v0, v16, a4
vnclipu.wx v4, v24, a4
vssseg2e8.v v0, (t0), a2
addi t0, t0, 2
addi t1, t1, 2
bnez t5, 2b
mul t3, a2, t2
sub a3, a3, t2
add a0, a0, t3
add a1, a1, t3
vsetvli zero, zero, e8, m2, ta, ma
vnclipu.wx v8, v16, a4
vsetvli zero, t1, e\b, m2, ta, ma
vsse\b\().v v8, (a0), a2
add a0, a0, t2
.endif
bnez a3, 1b
ret
endfunc
.endm
h264_weight 8, 2, 16
h264_weight 8, 4, 32
h264_weight 8, 8, 64
h264_weight 8, 16
.irp w, 16, 8, 4, 2
func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
lpad 0
li t6, \w
.if \w == 16
j ff_h264_biweight_pixels_simple_8_rvv
.else
j ff_h264_biweight_pixels_8_rvv
.endif
endfunc
.endr
.global ff_h264_weight_funcs_8_rvv
.hidden ff_h264_weight_funcs_8_rvv
const ff_h264_weight_funcs_8_rvv