ffmpeg/libavcodec/riscv/h263dsp_rvv.S

/*
 * Copyright © 2024 Rémi Denis-Courmont.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/riscv/asm.S"

        .option push
        .option norelax
/*
 * ff_h263_h_loop_filter_rvv
 *
 * Filters 8 rows in place; in each row the four bytes p0..p3 located at
 * a0 - 2 .. a0 + 1 are read, filtered and written back.
 *
 * Register usage, as read from the code below:
 *   a0  pointer into the picture (moved back by 2 so that it addresses p0)
 *   a1  byte stride between two consecutive rows
 *   a2  byte offset into ff_h263_loop_filter_strength; the commit history
 *       calls this the quantiser scale (qscale) -- confirm against the C
 *       prototype
 *
 * Linker relaxation is switched off around this function because the
 * strength table is addressed with a bare AUIPC / ADD / LBU sequence: the
 * index is added between the %pcrel_hi and %pcrel_lo halves, so the pair
 * must be left exactly as written.
 */
func ff_h263_h_loop_filter_rvv, zve32x
        lpad    0
        addi        a0, a0, -2  # a0 now addresses p0 of the first row
        vsetivli    zero, 8, e8, mf2, ta, ma  # 8 elements of 8 bits
        vlsseg4e8.v v8, (a0), a1  # v8..v11 = p0..p3, one row per element
        jal         t0, 1f  # run the shared kernel, t0 is the link register
        vssseg4e8.v v8, (a0), a1  # store the filtered p0..p3 back in place
        ret
/*
 * Shared filter kernel. It is also entered from ff_h263_v_loop_filter_rvv
 * through "jal t0, 1b", hence the ad-hoc calling convention:
 *   in:       v8..v11 = p0..p3 (8 unsigned bytes each)
 *             a2      = byte offset into ff_h263_loop_filter_strength
 *             t0      = return address
 *             vl = 8 and vtype = e8/mf2
 *   out:      v8..v11 = filtered p0..p3, vtype = e8/mf2 again
 *   clobbers: t1, t2, v0, v12, v14, v16, v18, v20, v24, v26, v28, v30
 *
 * Per element, with all intermediates held as 16-bit values:
 *   d   = (p0 - p3 + 4 * (p2 - p1)) / 8            (rounded toward zero)
 *   d1  = d                           if |d| < strength
 *       = sign(d) * 2 * strength - d  if strength <= |d| < 2 * strength
 *       = 0                           if |d| >= 2 * strength
 *   ad1 = |d1| >> 1
 *   d2  = clamp((p0 - p3) / 4, -ad1, ad1)          (division toward zero)
 *   p0 -= d2 and p3 += d2 are truncated to 8 bits,
 *   p1 += d1 and p2 -= d1 are saturated to 0..255.
 *
 * Scalar and vector instructions are deliberately interleaved; do not
 * regroup them without re-measuring.
 */
1:
        auipc       t1, %pcrel_hi(ff_h263_loop_filter_strength)
        vwsubu.vv   v14, v10, v9       # p2 - p1
        add         t1, t1, a2         # add the table index before the low part
        vwsubu.vv   v12, v8, v11       # p0 - p3
        vsetvli     zero, zero, e16, m1, ta, mu  # mu: the masked vneg.v below must keep inactive elements
        vsll.vi     v14, v14, 2        # 4 * (p2 - p1)
        lbu         t1, %pcrel_lo(1b)(t1) # strength
        vadd.vv     v16, v12, v14      # p0 - p3 + 4 * (p2 - p1)
        # Divide by 8 toward 0. v16 is a signed 10-bit value at this point.
        vsrl.vi     v18, v16, 16 - 3   # v18 = (v16 < 0) ? 7 : 0
        slli        t2, t1, 1          # 2 * strength
        vadd.vv     v16, v16, v18
        # v16 (d) is signed 7-bit, but later arithmetic requires 9 bits.
        vsra.vi     v16, v16, 3        # d
        vmv.v.x     v20, t2            # 2 * strength in every element
        vmslt.vi    v0, v16, 0         # mask: d < 0
        vneg.v      v18, v16           # -d
        vneg.v      v20, v20, v0.t     # sign(d) * 2 * strength
        vmax.vv     v18, v16, v18      # |d|
        vsub.vv     v20, v20, v16      # d1 if strength <= |d| <= 2 * strength
        vmsge.vx    v0, v18, t2        # mask: |d| >= 2 * strength
        vsrl.vi     v14, v12, 16 - 2   # v14 = (v12 < 0) ? 3 : 0
        vmerge.vxm  v20, v20, zero, v0 # d1 if strength <= |d|
        vadd.vv     v12, v12, v14
        vmsge.vx    v0, v18, t1        # mask: |d| >= strength
        vsra.vi     v12, v12, 2        # (p0 - p3) / 4
        vmerge.vvm  v16, v16, v20, v0  # d1
        vzext.vf2   v24, v8     # p0 as u16 (because vwrsubu.wv does not exist)
        vneg.v      v14, v16           # -d1
        vzext.vf2   v26, v9            # p1 as u16
        vmax.vv     v14, v16, v14      # |d1|
        vzext.vf2   v28, v10           # p2 as u16
        vsra.vi     v14, v14, 1        # ad1
        vadd.vv     v26, v26, v16      # p1 + d1
        vneg.v      v18, v14           # -ad1
        vmin.vv     v12, v12, v14      # upper clamp at ad1
        vsub.vv     v28, v28, v16      # p2 - d1
        vmax.vv     v12, v12, v18      # d2
        vmax.vx     v26, v26, zero     # lower clamp of p1 + d1 at 0
        vsub.vv     v24, v24, v12      # p0 - d2
        vmax.vx     v28, v28, zero     # lower clamp of p2 - d1 at 0
        vsetvli     zero, zero, e8, mf2, ta, ma  # back to bytes for narrowing
        vwaddu.wv   v30, v12, v11      # p3 + d2
        vncvt.x.x.w v8, v24            # p0 - d2, truncated to 8 bits
        vnclipu.wi  v9, v26, 0         # shift by 0: only saturates at 255
        vnclipu.wi  v10, v28, 0        # shift by 0: only saturates at 255
        vncvt.x.x.w v11, v30           # p3 + d2, truncated to 8 bits
        jr          t0                 # return to the wrapper through t0
endfunc
        .option pop

/*
 * ff_h263_v_loop_filter_rvv
 *
 * Filters 8 columns in place across four consecutive rows:
 *   p0 at a0 - 2 * a1, p1 at a0 - a1, p2 at a0, p3 at a0 + a1.
 *
 *   a0  pointer to the first byte of the p2 row
 *   a1  byte stride between two consecutive rows
 *   a2  passed through untouched to the shared kernel, which uses it as
 *       the byte offset into the strength table
 *
 * The rows are loaded into v8..v11 (p0..p3) with unit-stride loads, the
 * kernel at the preceding local label 1 is entered with t0 as the link
 * register, and the four rows are written back. t3..t5 hold the row
 * addresses; the kernel only uses t0..t2 as scalar scratch.
 */
func ff_h263_v_loop_filter_rvv, zve32x
        lpad    0
        vsetivli    zero, 8, e8, mf2, ta, ma  # 8 elements of 8 bits
        sub         t4, a0, a1         # t4 = address of the p1 row
        vle8.v      v10, (a0)          # p2
        add         t5, a0, a1         # t5 = address of the p3 row
        vle8.v      v9, (t4)           # p1
        sub         t3, t4, a1         # t3 = address of the p0 row
        vle8.v      v11, (t5)          # p3
        vle8.v      v8, (t3)           # p0
        jal         t0, 1b             # shared kernel, returns through t0
        vse8.v      v8, (t3)           # p0
        vse8.v      v9, (t4)           # p1
        vse8.v      v10, (a0)          # p2
        vse8.v      v11, (t5)          # p3
        ret
endfunc
lavc/h263dsp: R-V V {h,v}_loop_filter Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (cf. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)) the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictable branches.) 2024-05-19 07:03:29 +00:00			`/*`
			`* Copyright © 2024 Rémi Denis-Courmont.`
			`*`
			`* This file is part of FFmpeg.`
			`*`
			`* FFmpeg is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2.1 of the License, or (at your option) any later version.`
			`*`
			`* FFmpeg is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with FFmpeg; if not, write to the Free Software`
			`* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
			`*/`

			`#include "libavutil/riscv/asm.S"`

			`.option push`
			`.option norelax`
			`func ff_h263_h_loop_filter_rvv, zve32x`
lavc/riscv: add forward-edge CFI landing pads 2024-07-22 19:17:40 +00:00			`lpad 0`
lavc/h263dsp: R-V V {h,v}_loop_filter Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (cf. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)) the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictable branches.) 2024-05-19 07:03:29 +00:00			`addi a0, a0, -2`
			`vsetivli zero, 8, e8, mf2, ta, ma`
			`vlsseg4e8.v v8, (a0), a1`
			`jal t0, 1f`
			`vssseg4e8.v v8, (a0), a1`
			`ret`
			`1:`
lavc/riscv: don't set vxrm if unnecessary While narrowing clip is nominally a rounding operation, the rounding mode has no arithmetic consequence if the right shift is by zero bits. 2024-07-25 14:33:12 +00:00			`auipc t1, %pcrel_hi(ff_h263_loop_filter_strength)`
lavc/h263dsp: R-V V {h,v}_loop_filter Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (cf. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)) the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictable branches.) 2024-05-19 07:03:29 +00:00			`vwsubu.vv v14, v10, v9 # p2 - p1`
			`add t1, t1, a2`
			`vwsubu.vv v12, v8, v11 # p0 - p3`
			`vsetvli zero, zero, e16, m1, ta, mu`
			`vsll.vi v14, v14, 2`
lavc/riscv: don't set vxrm if unnecessary While narrowing clip is nominally a rounding operation, the rounding mode has no arithmetic consequence if the right shift is by zero bits. 2024-07-25 14:33:12 +00:00			`lbu t1, %pcrel_lo(1b)(t1) # strength`
lavc/h263dsp: R-V V {h,v}_loop_filter Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (cf. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)) the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictable branches.) 2024-05-19 07:03:29 +00:00			`vadd.vv v16, v12, v14`
			`# Divide by 8 toward 0. v16 is a signed 10-bit value at this point.`
			`vsrl.vi v18, v16, 16 - 3 # v18 = (v16 < 0) ? 7 : 0`
			`slli t2, t1, 1 # 2 * strength`
			`vadd.vv v16, v16, v18`
			`# v16 (d) is signed 7-bit, but later arithmetics require 9 bits.`
			`vsra.vi v16, v16, 3 # d`
			`vmv.v.x v20, t2`
			`vmslt.vi v0, v16, 0`
			`vneg.v v18, v16`
			`vneg.v v20, v20, v0.t # sign(d) * 2 * strength`
			`vmax.vv v18, v16, v18 # \|d\|`
			`vsub.vv v20, v20, v16 # d1 if strength <= \|d\| <= 2 * strength`
			`vmsge.vx v0, v18, t2`
			`vsrl.vi v14, v12, 16 - 2 # v14 = (v12 < 0) ? 3 : 0`
			`vmerge.vxm v20, v20, zero, v0 # d1 if strength <= \|d\|`
			`vadd.vv v12, v12, v14`
			`vmsge.vx v0, v18, t1`
			`vsra.vi v12, v12, 2 # (p0 - p3) / 4`
			`vmerge.vvm v16, v16, v20, v0 # d1`
			`vzext.vf2 v24, v8 # p0 as u16 (because vwrsubu.wv does not exist)`
			`vneg.v v14, v16`
			`vzext.vf2 v26, v9 # p1 as u16`
			`vmax.vv v14, v16, v14 # \|d1\|`
			`vzext.vf2 v28, v10 # p2 as u16`
			`vsra.vi v14, v14, 1 # ad1`
			`vadd.vv v26, v26, v16 # p1 + d1`
			`vneg.v v18, v14 # -ad1`
			`vmin.vv v12, v12, v14`
			`vsub.vv v28, v28, v16 # p2 - d1`
			`vmax.vv v12, v12, v18 # d2`
			`vmax.vx v26, v26, zero`
			`vsub.vv v24, v24, v12 # p0 - d2`
			`vmax.vx v28, v28, zero`
			`vsetvli zero, zero, e8, mf2, ta, ma`
			`vwaddu.wv v30, v12, v11 # p3 + d2`
			`vncvt.x.x.w v8, v24`
			`vnclipu.wi v9, v26, 0`
			`vnclipu.wi v10, v28, 0`
			`vncvt.x.x.w v11, v30`
			`jr t0`
			`endfunc`
			`.option pop`

			`func ff_h263_v_loop_filter_rvv, zve32x`
lavc/riscv: add forward-edge CFI landing pads 2024-07-22 19:17:40 +00:00			`lpad 0`
lavc/h263dsp: R-V V {h,v}_loop_filter Since the horizontal and vertical filters are identical except for a transposition, this uses a common subprocedure with an ad-hoc ABI. To preserve return-address stack prediction, a link register has to be used (cf. the "Control Transfer Instructions" from the RISC-V ISA Manual). The alternate/temporary link register T0 is used here, so that the normal RA is preserved (something Arm cannot do!). To load the strength value based on `qscale`, the shortest possible and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic LLA; ADD; LBU sequence would add one more instruction since LLA is a convenience alias for AUIPC; ADDI. To ensure that this trick works, relocation relaxation is disabled. To implement the two signed divisions by a power of two toward zero: (x / (1 << SHIFT)) the code relies on the small range of integers involved, computing: (x + (x >> (16 - SHIFT))) >> SHIFT rather than the more general: (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT Thus one ANDI instruction is avoided. T-Head C908: h263dsp.h_loop_filter_c: 228.2 h263dsp.h_loop_filter_rvv_i32: 144.0 h263dsp.v_loop_filter_c: 242.7 h263dsp.v_loop_filter_rvv_i32: 114.0 (C is probably worse in real use due to less predictable branches.) 2024-05-19 07:03:29 +00:00			`sub a4, a0, a1`
			`vsetivli zero, 8, e8, mf2, ta, ma`
			`vle8.v v10, (a0)`
			`sub a3, a4, a1`
			`vle8.v v9, (a4)`
			`add a5, a0, a1`
			`vle8.v v8, (a3)`
			`vle8.v v11, (a5)`
			`jal t0, 1b`
			`vse8.v v8, (a3)`
			`vse8.v v9, (a4)`
			`vse8.v v10, (a0)`
			`vse8.v v11, (a5)`
			`ret`
			`endfunc`