From bd226fdd743c302d76dd99f00d38eb00a9044a06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?=
Date: Tue, 3 Dec 2024 21:44:45 +0200
Subject: [PATCH] lavc/h264dsp: R-V V intra loop filter

As with the inter loop filter, performance metrics seem to be biased in
favour of the C implementation because checkasm inputs almost always
fall in the no-op case.

h264_h_loop_filter_chroma_intra_8bpp_c:                 82.8 ( 1.00x)
h264_h_loop_filter_chroma_intra_8bpp_rvv_i32:           72.6 ( 1.14x)
h264_h_loop_filter_chroma_mbaff_intra_8bpp_c:           41.1 ( 1.00x)
h264_h_loop_filter_chroma_mbaff_intra_8bpp_rvv_i32:     72.6 ( 0.57x)
h264_h_loop_filter_luma_intra_8bpp_c:                  166.1 ( 1.00x)
h264_h_loop_filter_luma_intra_8bpp_rvv_i32:            395.4 ( 0.42x)
h264_h_loop_filter_luma_mbaff_intra_8bpp_c:             93.3 ( 1.00x)
h264_h_loop_filter_luma_mbaff_intra_8bpp_rvv_i32:      395.4 ( 0.24x)
h264_v_loop_filter_chroma_intra_8bpp_c:                134.8 ( 1.00x)
h264_v_loop_filter_chroma_intra_8bpp_rvv_i32:           51.6 ( 2.61x)
h264_v_loop_filter_luma_intra_8bpp_c:                  468.1 ( 1.00x)
h264_v_loop_filter_luma_intra_8bpp_rvv_i32:            134.8 ( 3.47x)
---
 libavcodec/riscv/h264dsp_init.c |  26 +++++
 libavcodec/riscv/h264dsp_rvv.S  | 163 ++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 30dd272d6e..f214486bbe 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -40,6 +40,12 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                             int alpha, int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_luma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+                                            int alpha, int beta);
+void ff_h264_h_loop_filter_luma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+                                            int alpha, int beta);
+void ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv(uint8_t *pix, ptrdiff_t s,
+                                                  int a, int b);
 void ff_h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                         int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@@ -47,6 +53,13 @@ void ff_h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride,
 void ff_h264_h_loop_filter_chroma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                               int alpha, int beta,
                                               int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+                                              int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *pix, ptrdiff_t stride,
+                                              int alpha, int beta);
+void ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *pix,
+                                                    ptrdiff_t stride,
+                                                    int alpha, int beta);
 
 #define IDCT_DEPTH(depth) \
 void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
@@ -125,13 +138,26 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma_mbaff =
                 ff_h264_h_loop_filter_luma_mbaff_8_rvv;
+            dsp->h264_v_loop_filter_luma_intra =
+                ff_h264_v_loop_filter_luma_intra_8_rvv;
+            dsp->h264_h_loop_filter_luma_intra =
+                ff_h264_h_loop_filter_luma_intra_8_rvv;
+            dsp->h264_h_loop_filter_luma_mbaff_intra =
+                ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv;
             dsp->h264_v_loop_filter_chroma =
                 ff_h264_v_loop_filter_chroma_8_rvv;
+            dsp->h264_v_loop_filter_chroma_intra =
+                ff_h264_v_loop_filter_chroma_intra_8_rvv;
+
             if (chroma_format_idc <= 1) {
                 dsp->h264_h_loop_filter_chroma =
                     ff_h264_h_loop_filter_chroma_8_rvv;
                 dsp->h264_h_loop_filter_chroma_mbaff =
                     ff_h264_h_loop_filter_chroma_mbaff_8_rvv;
+                dsp->h264_h_loop_filter_chroma_intra =
+                    ff_h264_h_loop_filter_chroma_intra_8_rvv;
+                dsp->h264_h_loop_filter_chroma_mbaff_intra =
+                    ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv;
             }
 
             dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 5423b716ca..60015a7020 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -308,6 +308,125 @@ func ff_h264_v_loop_filter_\type\()_8_rvv, zve32x
         vse8.v      v11, (a0)
         ret
 endfunc
+
+        .variant_cc ff_h264_loop_filter_\type\()_intra_8_rvv
+func ff_h264_loop_filter_\type\()_intra_8_rvv, zve32x
+        # p3: v8, p2: v9, p1: v10, p0: v11, q0: v12, q1: v13, q2: v14, q3: v15
+        # alpha: a2, beta: a3
+        csrwi       vxrm, 0          # rounding mode 0 = round-to-nearest-up, used by vnclipu
+        srai        a4, a2, 2        # alpha >> 2
+        vwsubu.vv   v16, v11, v12    # p0 - q0 (callers set e8, so results are 16-bit)
+        addi        a4, a4, 2        # (alpha >> 2) + 2
+        vwsubu.vv   v18, v12, v11    # q0 - p0
+        vwsubu.vv   v20, v10, v11    # p1 - p0
+        vwsubu.vv   v22, v11, v10    # p0 - p1
+        vwsubu.vv   v24, v13, v12    # q1 - q0
+        vwsubu.vv   v26, v12, v13    # q0 - q1
+        vwsubu.vv   v28, v11, v9     # p0 - p2
+        vwsubu.vv   v30, v9, v11     # p2 - p0
+        vwsubu.vv   v4, v14, v12     # q2 - q0
+        vwsubu.vv   v6, v12, v14     # q0 - q2
+        vsetvli     zero, zero, e16, \e16mul, ta, ma
+        vmax.vv     v16, v16, v18    # abs(p0 - q0)
+        vmax.vv     v20, v20, v22    # abs(p1 - p0)
+        vmslt.vx    v18, v16, a2     # abs(p0 - q0) < alpha
+        vmax.vv     v24, v24, v26    # abs(q1 - q0)
+        vmslt.vx    v22, v20, a3     # abs(p1 - p0) < beta
+.ifc \type, luma
+        vmax.vv     v28, v28, v30    # abs(p2 - p0)
+.endif
+        vmand.mm    v18, v18, v22    # abs(p0 - q0) < alpha && abs(p1 - p0) < beta
+        vmslt.vx    v23, v24, a3     # abs(q1 - q0) < beta
+.ifc \type, luma
+        vmax.vv     v4, v4, v6       # abs(q2 - q0)
+        vmand.mm    v1, v18, v23     # abs(...) < A && abs(..) < B && abs(..) < B
+        vmslt.vx    v3, v16, a4      # abs(p0 - q0) < (alpha / 4) + 2
+        vmslt.vx    v6, v28, a3      # abs(p2 - p0) < beta
+        vmslt.vx    v7, v4, a3       # abs(q2 - q0) < beta
+        vmand.mm    v2, v3, v6       # p side: v3 && abs(p2 - p0) < beta
+        vmand.mm    v3, v3, v7       # q side: v3 && abs(q2 - q0) < beta
+.else
+        vmand.mm    v0, v18, v23     # chroma: the combined condition is the only mask
+.endif
+        vsetvli     zero, zero, e8, \e8mul, ta, mu
+        vwaddu.vv   v22, v11, v13    # p0 + q1
+        vwaddu.vv   v30, v10, v12    # p1 + q0
+        vwaddu.wv   v22, v22, v10    # p1 + p0 + q1
+        vwaddu.wv   v30, v30, v13    # p1 + q0 + q1
+        vwaddu.wv   v22, v22, v10    # 2p1 + p0 + q1
+        vwaddu.wv   v30, v30, v13    # p1 + q0 + 2q1
+.ifc \type, luma
+        vwaddu.vv   v16, v10, v11    # p1 + p0
+        vwaddu.vv   v20, v8, v9      # p3 + p2
+        vwaddu.wv   v16, v16, v12    # p1 + p0 + q0
+        vwaddu.vv   v24, v11, v12    # p0 + q0
+        vwaddu.vv   v28, v14, v15    # q2 + q3
+        vwaddu.wv   v24, v24, v13    # p0 + q0 + q1
+        vwaddu.wv   v18, v16, v9     # p2 + p1 + p0 + q0
+        vwaddu.wv   v16, v16, v13    # p1 + p0 + q0 + q1
+        vwaddu.wv   v26, v24, v14    # p0 + q0 + q1 + q2
+        vwaddu.wv   v24, v24, v10    # p1 + p0 + q0 + q1
+        vsetvli     zero, zero, e16, \e16mul, ta, ma
+        vsll.vi     v20, v20, 1      # 2p3 +2p2
+        vadd.vv     v16, v16, v18    # p2 +2p1 +2p0 +2q0 + q1
+        vadd.vv     v20, v18, v20    # 2p3 +3p2 + p1 + p0 + q0
+        vsll.vi     v28, v28, 1      # 2q2 +2q3
+        vadd.vv     v24, v24, v26    # p1 +2p0 +2q0 +2q1 + q2
+        vadd.vv     v28, v26, v28    # p0 + q0 + q1 +3q2 +2q3
+        vsetvli     zero, zero, e8, \e8mul, ta, mu
+        vmand.mm    v0, v1, v2       # v1 && v2: lanes that rewrite p0, p1 and p2
+        vnclipu.wi  v11, v16, 3, v0.t # p0'
+        vnclipu.wi  v10, v18, 2, v0.t # p1'
+        vnclipu.wi  v9, v20, 3, v0.t # p2'
+        vmandn.mm   v0, v1, v2       # v1 && !v2: lanes that rewrite p0 only (below)
+.endif
+        vnclipu.wi  v11, v22, 2, v0.t # p0'
+.ifc \type, luma
+        vmand.mm    v0, v1, v3       # v1 && v3: lanes that rewrite q0, q1 and q2
+        vnclipu.wi  v12, v24, 3, v0.t # q0'
+        vnclipu.wi  v13, v26, 2, v0.t # q1'
+        vnclipu.wi  v14, v28, 3, v0.t # q2'
+        vmandn.mm   v0, v1, v3       # v1 && !v3: lanes that rewrite q0 only (below)
+.endif
+        vnclipu.wi  v12, v30, 2, v0.t # q0'
+        jr          t0               # return through t0; the callers use jal t0
+endfunc
+
+func ff_h264_v_loop_filter_\type\()_intra_8_rvv, zve32x
+        lpad    0
+        sub         t3, a0, a1       # t3 = a0 - a1, address of the p0 row
+        vsetivli    zero, 4 * \inners, e8, \e8mul, ta, ma
+        vle8.v      v12, (a0)        # q0
+        sub         t2, t3, a1       # address of the p1 row
+        vle8.v      v11, (t3)        # p0
+        add         t4, a0, a1       # address of the q1 row
+        vle8.v      v10, (t2)        # p1
+.ifc \type, luma
+        sub         t1, t2, a1       # address of the p2 row
+.endif
+        vle8.v      v13, (t4)        # q1
+.ifc \type, luma
+        sub         t0, t1, a1       # address of the p3 row
+        vle8.v      v9, (t1)         # p2
+        add         t5, t4, a1       # address of the q2 row
+        vle8.v      v8, (t0)         # p3
+        add         t6, t5, a1       # address of the q3 row
+        vle8.v      v14, (t5)        # q2
+        vle8.v      v15, (t6)        # q3
+.endif
+        jal         t0, ff_h264_loop_filter_\type\()_intra_8_rvv
+.ifc \type, luma
+        vse8.v      v9, (t1)         # p2
+        vse8.v      v10, (t2)        # p1
+.endif
+        vse8.v      v11, (t3)        # p0
+        vse8.v      v12, (a0)        # q0
+.ifc \type, luma
+        vse8.v      v13, (t4)        # q1
+        vse8.v      v14, (t5)        # q2
+.endif
+        ret
+endfunc
 .endm
 
 loop_filter luma, 4, m1, m2
@@ -373,3 +492,47 @@ func ff_h264_h_loop_filter_chroma_mbaff_8_rvv, zve32x
         vssseg2e8.v v10, (a0), a1
         ret
 endfunc
+
+func ff_h264_h_loop_filter_luma_intra_8_rvv, zve32x
+        lpad    0
+        addi        a0, a0, -4       # step back 4 bytes to the p3 column
+        vsetivli    zero, 16, e8, m1, ta, ma
+        vlsseg8e8.v v8, (a0), a1     # 8 fields per segment, stride a1: p3..q3 into v8..v15
+        addi        a0, a0, 1        # advance to the p2 column for the store
+        jal         t0, ff_h264_loop_filter_luma_intra_8_rvv
+        vssseg6e8.v v9, (a0), a1     # write back p2..q2 (v9..v14)
+        ret
+endfunc
+
+func ff_h264_h_loop_filter_luma_mbaff_intra_8_rvv, zve32x
+        lpad    0
+        addi        a0, a0, -4       # step back 4 bytes to the p3 column
+        vsetivli    zero, 8, e8, m1, ta, ma
+        vlsseg8e8.v v8, (a0), a1     # 8 fields per segment, stride a1: p3..q3 into v8..v15
+        addi        a0, a0, 1        # advance to the p2 column for the store
+        jal         t0, ff_h264_loop_filter_luma_intra_8_rvv
+        vssseg6e8.v v9, (a0), a1     # write back p2..q2 (v9..v14)
+        ret
+endfunc
+
+func ff_h264_h_loop_filter_chroma_intra_8_rvv, zve32x
+        lpad    0
+        addi        a0, a0, -2       # step back 2 bytes to the p1 column
+        vsetivli    zero, 8, e8, mf2, ta, ma
+        vlsseg4e8.v v10, (a0), a1    # 4 fields per segment, stride a1: p1..q1 into v10..v13
+        addi        a0, a0, 1        # advance to the p0 column for the store
+        jal         t0, ff_h264_loop_filter_chroma_intra_8_rvv
+        vssseg2e8.v v11, (a0), a1    # write back p0 and q0 (v11, v12)
+        ret
+endfunc
+
+func ff_h264_h_loop_filter_chroma_mbaff_intra_8_rvv, zve32x
+        lpad    0
+        addi        a0, a0, -2       # step back 2 bytes to the p1 column
+        vsetivli    zero, 4, e8, mf2, ta, ma
+        vlsseg4e8.v v10, (a0), a1    # 4 fields per segment, stride a1: p1..q1 into v10..v13
+        addi        a0, a0, 1        # advance to the p0 column for the store
+        jal         t0, ff_h264_loop_filter_chroma_intra_8_rvv
+        vssseg2e8.v v11, (a0), a1    # write back p0 and q0 (v11, v12)
+        ret
+endfunc