lavc/h264dsp: R-V V intra loop filter

As with the inter loop filter, performance metrics seem to be biased in
favour of the C implementation because checkasm inputs almost always
fall in the no-op case.

h264_h_loop_filter_chroma_intra_8bpp_c:                 82.8 ( 1.00x)
h264_h_loop_filter_chroma_intra_8bpp_rvv_i32:           72.6 ( 1.14x)
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c:           41.1 ( 1.00x)
h264_h_loop_filter_chroma_mbaff_intra_8bpp_rvv_i32:     72.6 ( 0.57x)
h264_h_loop_filter_luma_intra_8bpp_c:                  166.1 ( 1.00x)
h264_h_loop_filter_luma_intra_8bpp_rvv_i32:            395.4 ( 0.42x)
h264_h_loop_filter_luma_mbaff_intra_8bpp_c:             93.3 ( 1.00x)
h264_h_loop_filter_luma_mbaff_intra_8bpp_rvv_i32:      395.4 ( 0.24x)
h264_v_loop_filter_chroma_intra_8bpp_c:                134.8 ( 1.00x)
h264_v_loop_filter_chroma_intra_8bpp_rvv_i32:           51.6 ( 2.61x)
h264_v_loop_filter_luma_intra_8bpp_c:                  468.1 ( 1.00x)
h264_v_loop_filter_luma_intra_8bpp_rvv_i32:            134.8 ( 3.47x)
This commit is contained in:
Rémi Denis-Courmont 2024-12-03 21:44:45 +02:00
parent 16d4945e9a
commit bd226fdd74
2 changed files with 189 additions and 0 deletions

View File

@ -40,6 +40,12 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
/*
 * 8-bit luma intra loop filters. Unlike the non-intra variants, these take no
 * tc0 table. All three share the same (pix, stride, alpha, beta) signature.
 */
void ff_h264_v_loop_filter_luma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta);
void ff_h264_h_loop_filter_luma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                            int alpha, int beta);
void ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv(uint8_t *pix,
                                                  ptrdiff_t stride,
                                                  int alpha, int beta);
void ff_h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@ -47,6 +53,13 @@ void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
void ff_h264_h_loop_filter_chroma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta,
int8_t *tc0);
/*
 * 8-bit chroma intra loop filters. Same (pix, stride, alpha, beta) signature
 * as the luma intra filters; no tc0 table is passed.
 */
void ff_h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
void ff_h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta);
void ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *pix,
ptrdiff_t stride,
int alpha, int beta);
#define IDCT_DEPTH(depth) \
void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
@ -125,13 +138,26 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff =
ff_h264_h_loop_filter_luma_mbaff_8_rvv;
dsp->h264_v_loop_filter_luma_intra =
ff_h264_v_loop_filter_luma_intra_8_rvv;
dsp->h264_h_loop_filter_luma_intra =
ff_h264_h_loop_filter_luma_intra_8_rvv;
dsp->h264_h_loop_filter_luma_mbaff_intra =
ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv;
dsp->h264_v_loop_filter_chroma =
ff_h264_v_loop_filter_chroma_8_rvv;
dsp->h264_v_loop_filter_chroma_intra =
ff_h264_v_loop_filter_chroma_intra_8_rvv;
if (chroma_format_idc <= 1) {
dsp->h264_h_loop_filter_chroma =
ff_h264_h_loop_filter_chroma_8_rvv;
dsp->h264_h_loop_filter_chroma_mbaff =
ff_h264_h_loop_filter_chroma_mbaff_8_rvv;
dsp->h264_h_loop_filter_chroma_intra =
ff_h264_h_loop_filter_chroma_intra_8_rvv;
dsp->h264_h_loop_filter_chroma_mbaff_intra =
ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv;
}
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;

View File

@ -308,6 +308,125 @@ func ff_h264_v_loop_filter_\type\()_8_rvv, zve32x
vse8.v v11, (a0)
ret
endfunc
# Shared core of the 8-bit intra loop filter, used by the vertical and the
# horizontal entry points. It is not C-callable: it is marked .variant_cc,
# is entered with jal t0 and returns with jr t0.
#
# Inputs:   v8-v15 = p3, p2, p1, p0, q0, q1, q2, q3 (one pixel per element).
#           Chroma only needs p1, p0, q0, q1 in v10-v13.
#           a2 = alpha, a3 = beta, t0 = return address.
#           vl and an e8 vtype must already be set by the caller, since only
#           the vl-preserving form of vsetvli is used here.
# Outputs:  luma: p2..q2 updated in v9-v14; chroma: p0, q0 updated in v11-v12.
#           Elements failing the filter condition are left undisturbed.
# Clobbers: a4, vxrm, vtype, v0 and the v16-v31 temporaries; luma also v1-v7.
        .variant_cc ff_h264_loop_filter_\type\()_intra_8_rvv
func ff_h264_loop_filter_\type\()_intra_8_rvv, zve32x
        # p3: v8, p2: v9, p1: v10, p0: v11, q0: v12, q1: v13, q2: v14, q3: v15
        # alpha: a2, beta: a3
        csrwi       vxrm, 0       # round-to-nearest-up for the vnclipu below
        srai        a4, a2, 2
        # Widening subtractions in both directions: the signed 16-bit maximum
        # of (a - b) and (b - a) computed further down is abs(a - b).
        vwsubu.vv   v16, v11, v12
        addi        a4, a4, 2     # a4 = (alpha >> 2) + 2
        vwsubu.vv   v18, v12, v11
        vwsubu.vv   v20, v10, v11
        vwsubu.vv   v22, v11, v10
        vwsubu.vv   v24, v13, v12
        vwsubu.vv   v26, v12, v13
    .ifc \type, luma
        # The p2/q2 differences only feed the luma-only strong filter test.
        # Chroma callers never load v9/v14, so skip this work for chroma.
        vwsubu.vv   v28, v11, v9
        vwsubu.vv   v30, v9, v11
        vwsubu.vv   v4, v14, v12
        vwsubu.vv   v6, v12, v14
    .endif
        vsetvli     zero, zero, e16, \e16mul, ta, ma
        vmax.vv     v16, v16, v18 # abs(p0 - q0)
        vmax.vv     v20, v20, v22 # abs(p1 - p0)
        vmslt.vx    v18, v16, a2
        vmax.vv     v24, v24, v26 # abs(q1 - q0)
        vmslt.vx    v22, v20, a3
    .ifc \type, luma
        vmax.vv     v28, v28, v30 # abs(p2 - p0)
    .endif
        vmand.mm    v18, v18, v22
        vmslt.vx    v23, v24, a3
    .ifc \type, luma
        vmax.vv     v4, v4, v6 # abs(q2 - q0)
        vmand.mm    v1, v18, v23 # abs(...) < A && abs(..) < B && abs(..) < B
        vmslt.vx    v3, v16, a4 # abs(p0 - q0) < (alpha / 4) + 2
        vmslt.vx    v6, v28, a3 # abs(p2 - p0) < beta
        vmslt.vx    v7, v4, a3 # abs(q2 - q0) < beta
        vmand.mm    v2, v3, v6    # strong filter condition, p side
        vmand.mm    v3, v3, v7    # strong filter condition, q side
    .else
        vmand.mm    v0, v18, v23  # chroma: single filter condition
    .endif
        # Back to 8-bit elements; mask-undisturbed so that the masked narrowing
        # writes below leave unfiltered pixels untouched.
        vsetvli     zero, zero, e8, \e8mul, ta, mu
        vwaddu.vv   v22, v11, v13
        vwaddu.vv   v30, v10, v12
        vwaddu.wv   v22, v22, v10
        vwaddu.wv   v30, v30, v13
        vwaddu.wv   v22, v22, v10 # 2p1 + p0 + q1
        vwaddu.wv   v30, v30, v13 # p1 + q0 + 2q1
    .ifc \type, luma
        vwaddu.vv   v16, v10, v11
        vwaddu.vv   v20, v8, v9
        vwaddu.wv   v16, v16, v12 # p1 + p0 + q0
        vwaddu.vv   v24, v11, v12
        vwaddu.vv   v28, v14, v15
        vwaddu.wv   v24, v24, v13 # p0 + q0 + q1
        vwaddu.wv   v18, v16, v9 # p2 + p1 + p0 + q0
        vwaddu.wv   v16, v16, v13 # p1 + p0 + q0 + q1
        vwaddu.wv   v26, v24, v14 # p0 + q0 + q1 + q2
        vwaddu.wv   v24, v24, v10 # p1 + p0 + q0 + q1
        vsetvli     zero, zero, e16, \e16mul, ta, ma
        vsll.vi     v20, v20, 1 # 2p3 +2p2
        vadd.vv     v16, v16, v18 # p2 +2p1 +2p0 +2q0 + q1
        vadd.vv     v20, v18, v20 # 2p3 +3p2 + p1 + p0 + q0
        vsll.vi     v28, v28, 1 # 2q2 +2q3
        vadd.vv     v24, v24, v26 # p1 +2p0 +2q0 +2q1 + q2
        vadd.vv     v28, v26, v28 # p0 + q0 + q1 +3q2 +2q3
        vsetvli     zero, zero, e8, \e8mul, ta, mu
        # All sums were formed from the unmodified pixels above, so the
        # in-place masked writes below cannot feed back into each other.
        vmand.mm    v0, v1, v2    # filter condition and strong p side
        vnclipu.wi  v11, v16, 3, v0.t # p0'
        vnclipu.wi  v10, v18, 2, v0.t # p1'
        vnclipu.wi  v9, v20, 3, v0.t # p2'
        vmandn.mm   v0, v1, v2    # filter condition without strong p side
    .endif
        vnclipu.wi  v11, v22, 2, v0.t # p0'
    .ifc \type, luma
        vmand.mm    v0, v1, v3    # filter condition and strong q side
        vnclipu.wi  v12, v24, 3, v0.t # q0'
        vnclipu.wi  v13, v26, 2, v0.t # q1'
        vnclipu.wi  v14, v28, 3, v0.t # q2'
        vmandn.mm   v0, v1, v3    # filter condition without strong q side
    .endif
        vnclipu.wi  v12, v30, 2, v0.t # q0'
        jr          t0
endfunc
# Vertical intra loop filter entry point (a0 = pix, a1 = stride, a2 = alpha,
# a3 = beta). Loads whole rows on either side of pix, runs the shared intra
# filter core and stores the rows that the core may have modified.
# Row pointers: t0 = p3, t1 = p2, t2 = p1, t3 = p0, a0 = q0, t4 = q1, t5 = q2,
# t6 = q3. t1-t6 and a0 must survive the call to the core; t0 is only needed
# for the p3 load and is then reused as the return address register.
func ff_h264_v_loop_filter_\type\()_intra_8_rvv, zve32x
        lpad        0
        vsetivli    zero, 4 * \inners, e8, \e8mul, ta, ma
        sub         t3, a0, a1
        vle8.v      v12, (a0)       # q0
        sub         t2, t3, a1
        vle8.v      v11, (t3)       # p0
        add         t4, a0, a1
        vle8.v      v10, (t2)       # p1
        vle8.v      v13, (t4)       # q1
    .ifc \type, luma
        # Luma also needs the two outer rows on each side.
        sub         t1, t2, a1
        sub         t0, t1, a1
        vle8.v      v9, (t1)        # p2
        add         t5, t4, a1
        vle8.v      v8, (t0)        # p3
        add         t6, t5, a1
        vle8.v      v14, (t5)       # q2
        vle8.v      v15, (t6)       # q3
    .endif
        jal         t0, ff_h264_loop_filter_\type\()_intra_8_rvv
        # Write back p2..q2 for luma, only p0 and q0 for chroma.
    .ifc \type, luma
        vse8.v      v9, (t1)
        vse8.v      v10, (t2)
    .endif
        vse8.v      v11, (t3)
        vse8.v      v12, (a0)
    .ifc \type, luma
        vse8.v      v13, (t4)
        vse8.v      v14, (t5)
    .endif
        ret
endfunc
.endm
loop_filter luma, 4, m1, m2
@ -373,3 +492,47 @@ func ff_h264_h_loop_filter_chroma_mbaff_8_rvv, zve32x
vssseg2e8.v v10, (a0), a1
ret
endfunc
# Horizontal luma intra loop filter (a0 = pix, a1 = stride, a2 = alpha,
# a3 = beta). Processes 16 rows: the 8 bytes starting at pix - 4 in each row
# are loaded as p3..q3 into v8-v15 with a strided segment load, filtered by
# the shared luma intra core, and the 6 bytes p2..q2 are stored back.
func ff_h264_h_loop_filter_luma_intra_8_rvv, zve32x
        lpad        0
        vsetivli    zero, 16, e8, m1, ta, ma
        addi        a0, a0, -4              # a0 -> p3 of the first row
        vlsseg8e8.v v8, (a0), a1            # v8-v15 = p3..q3, one row per element
        addi        a0, a0, 1               # a0 -> p2, first byte written back
        jal         t0, ff_h264_loop_filter_luma_intra_8_rvv
        vssseg6e8.v v9, (a0), a1            # store v9-v14 = p2..q2
        ret
endfunc
# MBAFF variant of the horizontal luma intra loop filter (a0 = pix,
# a1 = stride, a2 = alpha, a3 = beta). Same sequence as the non-MBAFF entry
# point but with a vector length of 8, so only 8 rows are processed. The
# LMUL stays m1 because the shared luma core switches element width with the
# vl-preserving form of vsetvli.
func ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv, zve32x
        lpad        0
        vsetivli    zero, 8, e8, m1, ta, ma
        addi        a0, a0, -4              # a0 -> p3 of the first row
        vlsseg8e8.v v8, (a0), a1            # v8-v15 = p3..q3, one row per element
        addi        a0, a0, 1               # a0 -> p2, first byte written back
        jal         t0, ff_h264_loop_filter_luma_intra_8_rvv
        vssseg6e8.v v9, (a0), a1            # store v9-v14 = p2..q2
        ret
endfunc
# Horizontal chroma intra loop filter (a0 = pix, a1 = stride, a2 = alpha,
# a3 = beta). Processes 8 rows: the 4 bytes starting at pix - 2 in each row
# are loaded as p1, p0, q0, q1 into v10-v13, filtered by the shared chroma
# intra core, and the 2 bytes p0, q0 are stored back.
func ff_h264_h_loop_filter_chroma_intra_8_rvv, zve32x
        lpad        0
        vsetivli    zero, 8, e8, mf2, ta, ma
        addi        a0, a0, -2              # a0 -> p1 of the first row
        vlsseg4e8.v v10, (a0), a1           # v10-v13 = p1, p0, q0, q1
        addi        a0, a0, 1               # a0 -> p0, first byte written back
        jal         t0, ff_h264_loop_filter_chroma_intra_8_rvv
        vssseg2e8.v v11, (a0), a1           # store v11-v12 = p0, q0
        ret
endfunc
# MBAFF variant of the horizontal chroma intra loop filter (a0 = pix,
# a1 = stride, a2 = alpha, a3 = beta). Same sequence as the non-MBAFF entry
# point but with a vector length of 4, so only 4 rows are processed.
func ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv, zve32x
        lpad        0
        vsetivli    zero, 4, e8, mf2, ta, ma
        addi        a0, a0, -2              # a0 -> p1 of the first row
        vlsseg4e8.v v10, (a0), a1           # v10-v13 = p1, p0, q0, q1
        addi        a0, a0, 1               # a0 -> p0, first byte written back
        jal         t0, ff_h264_loop_filter_chroma_intra_8_rvv
        vssseg2e8.v v11, (a0), a1           # store v11-v12 = p0, q0
        ret
endfunc