lavc/h264dsp: R-V V 8-bit h264_weight_pixels

There are two implementations here: - a generic scalable one processing two columns at a time, - a specialised one processing one (fixed-size) row at a time. Unsurprisingly, the generic one works out better with smaller widths. With larger widths, the gains from filling vectors are outweighed by the extra cost of strided loads and stores. In other words, memory accesses become the bottleneck. T-Head C908: h264_weight2_8_c: 54.5 h264_weight2_8_rvv_i32: 13.7 h264_weight4_8_c: 101.7 h264_weight4_8_rvv_i32: 27.5 h264_weight8_8_c: 197.0 h264_weight8_8_rvv_i32: 75.5 h264_weight16_8_c: 385.0 h264_weight16_8_rvv_i32: 74.2 SpacemiT X60: h264_weight2_8_c: 48.5 h264_weight2_8_rvv_i32: 8.2 h264_weight4_8_c: 90.7 h264_weight4_8_rvv_i32: 16.5 h264_weight8_8_c: 175.0 h264_weight8_8_rvv_i32: 37.7 h264_weight16_8_c: 342.2 h264_weight16_8_rvv_i32: 66.0
2024-07-04 21:38:48 +03:00 · 2024-07-04 21:38:48 +03:00 · 3606e592ea
parent 85706f5136
commit 3606e592ea
2 changed files with 90 additions and 0 deletions
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -21,12 +21,15 @@
 #include "config.h"

 #include <stdint.h>
+#include <string.h>

 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
 #include "libavutil/riscv/cpu.h"
 #include "libavcodec/h264dsp.h"

+/* Table of 8-bit weight functions defined in h264dsp_rvv.S, with one entry
+ * per block width in the order 16, 8, 4, 2. */
+extern const h264_weight_func ff_h264_weight_funcs_8_rvv[];
+
 void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                      int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
@@ -60,6 +63,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
 # if HAVE_RVV
    if (flags & AV_CPU_FLAG_RVV_I32) {
        if (bit_depth == 8 && ff_rv_vlen_least(128)) {
+            memcpy(dsp->weight_h264_pixels_tab,
+                   ff_h264_weight_funcs_8_rvv,
+                   sizeof (dsp->weight_h264_pixels_tab));
+
            dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
            dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
            dsp->h264_h_loop_filter_luma_mbaff =
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -26,6 +26,89 @@

 #include "libavutil/riscv/asm.S"

+# Weights one fixed-width row of 8-bit pixels per loop iteration, in place.
+# Reached from the width-16 entry stub, which sets a6.
+# NOTE(review): a0-a5 presumably carry the h264_weight_func arguments
+# (block, stride, height, log2_denom, weight, offset) -- TODO confirm.
+#   a0: pixel pointer, loaded from and stored back to
+#   a1: byte offset added to a0 after each row
+#   a2: row counter, tested after the decrement so it must be non-zero on entry
+#   a3: shift amount (left shift of a5, right shift of the weighted sum)
+#   a4: scalar multiplier, taken as signed by vwmaccsu.vx
+#   a5: additive term, shifted left by a3 once before the loop
+#   a6: requested vector length, i.e. pixels per row
+func ff_h264_weight_pixels_simple_8_rvv, zve32x
+        # Fixed-point rounding mode 0 (round-to-nearest-up) for vnclip below.
+        csrwi   vxrm, 0
+        sll     a5, a5, a3
+1:
+        # Start at e32/m4 for the accumulator; later vsetvli keep the same
+        # SEW/LMUL ratio so the vector length is unchanged.
+        vsetvli zero, a6, e32, m4, ta, ma
+        vle8.v  v8, (a0)
+        addi    a2, a2, -1
+        # Seed every 32-bit accumulator element with the shifted a5.
+        vmv.v.x v16, a5
+        vsetvli zero, zero, e16, m2, ta, ma
+        # Widen pixels from 8 to 16 bits (zero-extended).
+        vzext.vf2   v24, v8
+        # 32-bit accumulator += signed a4 * unsigned pixel
+        vwmaccsu.vx v16, a4, v24
+        # Rounding right shift by a3, signed saturation down to 16 bits.
+        vnclip.wx   v16, v16, a3
+        # Clamp negative values to zero ahead of the unsigned narrowing.
+        vmax.vx v16, v16, zero
+        vsetvli zero, zero, e8, m1, ta, ma
+        # Unsigned saturation down to 8 bits, shift amount 0.
+        vnclipu.wi  v8, v16, 0
+        vse8.v  v8, (a0)
+        add     a0, a0, a1
+        bnez    a2, 1b
+
+        ret
+endfunc
+
+# Weights 8-bit pixels in place, two adjacent columns per inner iteration.
+# Vector elements run down the rows: each element is fetched with a byte
+# stride of a1, and the two segment fields are the two neighbouring columns.
+# Reached from the width 8, 4 and 2 entry stubs, which set a6.
+# NOTE(review): a0-a5 presumably carry the h264_weight_func arguments
+# (block, stride, height, log2_denom, weight, offset) -- TODO confirm.
+#   a0: pixel pointer of the first row still to be processed
+#   a1: byte stride between rows
+#   a2: rows remaining, tested after subtraction so it must be non-zero on entry
+#   a3: shift amount (left shift of a5, right shift of the weighted sums)
+#   a4: scalar multiplier, taken as signed by vwmaccsu.vx
+#   a5: additive term, shifted left by a3 once before the loops
+#   a6: column count; stepped down by 2 and tested with bnez, so it must be
+#       even and non-zero
+# Temporaries: t0 (column cursor), t2 (rows in this pass), t3, t6 (columns left)
+func ff_h264_weight_pixels_8_rvv, zve32x
+        # Fixed-point rounding mode 0 (round-to-nearest-up) for vnclip below.
+        csrwi   vxrm, 0
+        sll     a5, a5, a3
+1:
+        # Outer loop: one pass over up to t2 rows. Restart at column 0.
+        mv      t0, a0
+        mv      t6, a6
+2:
+        # t2 = number of rows handled in this pass (at most a2).
+        vsetvli t2, a2, e32, m8, ta, ma
+        # Two-field strided segment load: column 0 bytes land in the group
+        # at v0, column 1 bytes in the group at v2.
+        vlsseg2e8.v v0, (t0), a1
+        addi    t6, t6, -2
+        # Seed both 32-bit accumulator groups with the shifted a5.
+        vmv.v.x v16, a5
+        vmv.v.x v24, a5
+        vsetvli zero, zero, e16, m4, ta, ma
+        # Widen both columns from 8 to 16 bits (zero-extended).
+        vzext.vf2   v8, v0
+        vzext.vf2   v12, v2
+        # 32-bit accumulators += signed a4 * unsigned pixel
+        vwmaccsu.vx v16, a4, v8
+        vwmaccsu.vx v24, a4, v12
+        # Rounding right shift by a3, signed saturation down to 16 bits.
+        vnclip.wx   v8, v16, a3
+        vnclip.wx   v12, v24, a3
+        # Clamp negative values to zero ahead of the unsigned narrowing.
+        vmax.vx v8, v8, zero
+        vmax.vx v12, v12, zero
+        vsetvli zero, zero, e8, m2, ta, ma
+        # Unsigned saturation down to 8 bits, back into the segment registers.
+        vnclipu.wi  v0, v8, 0
+        vnclipu.wi  v2, v12, 0
+        vssseg2e8.v v0, (t0), a1
+        # Move on to the next pair of columns.
+        addi    t0, t0, 2
+        bnez    t6, 2b
+
+        # Advance a0 by the t2 rows just processed and update the row count.
+        mul     t3, a1, t2
+        sub     a2, a2, t2
+        add     a0, a0, t3
+        bnez    a2, 1b
+
+        ret
+endfunc
+
+# Per-width entry points for widths 16, 8, 4 and 2. Each one loads its width
+# into a6 and tail-jumps to a shared body, leaving a0-a5 untouched.
+# Width 16 goes to the row-at-a-time variant; the other widths go to the
+# two-columns-at-a-time variant.
+.irp    w, 16, 8, 4, 2
+func ff_h264_weight_pixels\w\()_8_rvv, zve32x
+        li      a6, \w
+        .if     \w == 16
+        j       ff_h264_weight_pixels_simple_8_rvv
+        .else
+        j       ff_h264_weight_pixels_8_rvv
+        .endif
+endfunc
+.endr
+
+        # Table holding the address of each per-width entry point, in the
+        # order 16, 8, 4, 2. The C init code copies it into
+        # dsp->weight_h264_pixels_tab with memcpy, so the order and the
+        # entry size must match that array.
+        # Entry size follows __riscv_xlen: .word for 32, .dword for 64,
+        # .qword for anything else.
+        .global ff_h264_weight_funcs_8_rvv
+        .hidden ff_h264_weight_funcs_8_rvv
+const ff_h264_weight_funcs_8_rvv
+        .irp    w, 16, 8, 4, 2
+#if __riscv_xlen == 32
+        .word   ff_h264_weight_pixels\w\()_8_rvv
+#elif __riscv_xlen == 64
+        .dword  ff_h264_weight_pixels\w\()_8_rvv
+#else
+        .qword  ff_h264_weight_pixels\w\()_8_rvv
+#endif
+        .endr
+endconst
+
        .variant_cc ff_h264_loop_filter_luma_8_rvv
 func ff_h264_loop_filter_luma_8_rvv, zve32x
        # p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13