ffmpeg/libavcodec/riscv/h263dsp_rvv.S
Rémi Denis-Courmont 910d281b21 lavc/h263dsp: R-V V {h,v}_loop_filter
Since the horizontal and vertical filters are identical except for a
transposition, this uses a common subprocedure with an ad-hoc ABI.
To preserve return-address stack prediction, a link register has to be
used (cf. the "Control Transfer Instructions" from the
RISC-V ISA Manual). The alternate/temporary link register T0 is used
here, so that the normal RA is preserved (something Arm cannot do!).

To load the strength value based on `qscale`, the shortest possible
and PIC-compatible sequence is used: AUIPC; ADD; LBU. The classic
LLA; ADD; LBU sequence would add one more instruction since LLA is a
convenience alias for AUIPC; ADDI. To ensure that this trick works,
relocation relaxation is disabled.

To implement the two signed divisions by a power of two toward zero:
 (x / (1 << SHIFT))
the code relies on the small range of integers involved, computing:
 (x + (x >> (16 - SHIFT))) >> SHIFT
rather than the more general:
 (x + ((x >> (16 - 1)) & ((1 << SHIFT) - 1))) >> SHIFT
Thus one ANDI instruction is avoided.

T-Head C908:
h263dsp.h_loop_filter_c:       228.2
h263dsp.h_loop_filter_rvv_i32: 144.0
h263dsp.v_loop_filter_c:       242.7
h263dsp.v_loop_filter_rvv_i32: 114.0
(C is probably worse in real use due to less predictable branches.)
2024-05-22 19:15:39 +03:00

101 lines
3.7 KiB
ArmAsm

/*
* Copyright © 2024 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/riscv/asm.S"
.option push
.option norelax
/*
 * Linker relaxation stays disabled for this region: the AUIPC at local
 * label 2 below is paired by hand with an ADD and an LBU that carries the
 * matching %pcrel_lo. The previous option state is restored by the
 * .option pop that follows endfunc.
 */
/*
 * ff_h263_h_loop_filter_rvv
 *
 * Loads 8 segments of 4 consecutive bytes, filters them with the shared
 * core at local label 1, and stores them back to the same addresses.
 * Segment i starts at (a0 - 2) + i * a1.
 *
 * In:   a0 = base address (presumably the src pointer of the C prototype,
 *            TODO confirm against the caller)
 *       a1 = byte distance between consecutive segments
 *       a2 = byte index into ff_h263_loop_filter_strength (presumably
 *            qscale)
 * Clobbers: a0, t0, t1, t2, v0, v8-v12, v14, v16, v18, v20, v24, v26, v28,
 *           v30, vxrm, vl, vtype
 *
 * NOTE(review): vl = 8 at e8/mf2 needs VLEN >= 128 and nothing in this
 * file checks it; presumably the caller guarantees it, to be confirmed.
 */
func ff_h263_h_loop_filter_rvv, zve32x
addi a0, a0, -2 # each segment starts 2 bytes before the passed address
vsetivli zero, 8, e8, mf2, ta, ma # vl = 8 elements of 8 bits
vlsseg4e8.v v8, (a0), a1 # v8..v11 = bytes 0..3 of every segment (p0..p3)
jal t0, 1f # call the filter core, linking through t0 so that ra is untouched
vssseg4e8.v v8, (a0), a1 # write the 4 filtered bytes of every segment back
ret
/*
 * Shared filter core with an ad-hoc ABI. ff_h263_v_loop_filter_rvv reaches
 * it with a backward reference to numeric label 1, so no other label named
 * 1 may be introduced between here and that call.
 *
 * In:   v8, v9, v10, v11 = p0, p1, p2, p3 (8 unsigned bytes each)
 *       a2 = byte index into ff_h263_loop_filter_strength
 *       t0 = return address
 *       vl = 8 with vtype e8/mf2, as set by the caller; only vtype is
 *       rewritten here (both vsetvli use zero, zero) and it is e8/mf2
 *       again on return
 * Out:  v8..v11 = filtered p0..p3 as bytes
 * Clobbers: t1, t2, v0, v12, v14, v16, v18, v20, v24, v26, v28, v30, vxrm
 *
 * The scalar instructions that fetch the strength are interleaved with the
 * vector instructions; they only touch t1 and t2.
 */
1:
csrwi vxrm, 0 # fixed-point rounding mode 0 (round-to-nearest-up) for vnclipu.wi below
2: auipc t1, %pcrel_hi(ff_h263_loop_filter_strength) # t1 = pc + high part of the table offset
vwsubu.vv v14, v10, v9 # p2 - p1
add t1, t1, a2 # add the table index before the low part is applied
vwsubu.vv v12, v8, v11 # p0 - p3
vsetvli zero, zero, e16, m1, ta, mu # 16-bit elements, same vl; mu so the masked vneg keeps inactive lanes
vsll.vi v14, v14, 2 # 4 * (p2 - p1)
lbu t1, %pcrel_lo(2b)(t1) # strength
vadd.vv v16, v12, v14 # (p0 - p3) + 4 * (p2 - p1)
# Divide by 8 toward 0. v16 is a signed 10-bit value at this point.
vsrl.vi v18, v16, 16 - 3 # v18 = (v16 < 0) ? 7 : 0
slli t2, t1, 1 # 2 * strength
vadd.vv v16, v16, v18 # add the bias so that the arithmetic shift rounds toward 0
# v16 (d) is signed 7-bit, but later arithmetic requires 9 bits.
vsra.vi v16, v16, 3 # d
vmv.v.x v20, t2 # splat 2 * strength
vmslt.vi v0, v16, 0 # mask: d < 0
vneg.v v18, v16 # -d
vneg.v v20, v20, v0.t # sign(d) * 2 * strength
vmax.vv v18, v16, v18 # |d|
vsub.vv v20, v20, v16 # d1 if strength <= |d| <= 2 * strength
vmsge.vx v0, v18, t2 # mask: |d| >= 2 * strength
vsrl.vi v14, v12, 16 - 2 # v14 = (v12 < 0) ? 3 : 0
vmerge.vxm v20, v20, zero, v0 # d1 if strength <= |d| (0 where the mask is set)
vadd.vv v12, v12, v14 # add the bias so that the arithmetic shift rounds toward 0
vmsge.vx v0, v18, t1 # mask: |d| >= strength
vsra.vi v12, v12, 2 # (p0 - p3) / 4
vmerge.vvm v16, v16, v20, v0 # d1 (d itself where |d| < strength)
vzext.vf2 v24, v8 # p0 as u16 (because vwrsubu.wv does not exist)
vneg.v v14, v16 # -d1
vzext.vf2 v26, v9 # p1 as u16
vmax.vv v14, v16, v14 # |d1|
vzext.vf2 v28, v10 # p2 as u16
vsra.vi v14, v14, 1 # ad1
vadd.vv v26, v26, v16 # p1 + d1
vneg.v v18, v14 # -ad1
vmin.vv v12, v12, v14 # min((p0 - p3) / 4, ad1)
vsub.vv v28, v28, v16 # p2 - d1
vmax.vv v12, v12, v18 # d2 = (p0 - p3) / 4 clamped to [-ad1, ad1]
vmax.vx v26, v26, zero # clamp p1 + d1 at 0 from below
vsub.vv v24, v24, v12 # p0 - d2
vmax.vx v28, v28, zero # clamp p2 - d1 at 0 from below
vsetvli zero, zero, e8, mf2, ta, ma # back to 8-bit elements for the widening add and the narrowing ops
vwaddu.wv v30, v12, v11 # p3 + d2
vncvt.x.x.w v8, v24 # low 8 bits of p0 - d2
vnclipu.wi v9, v26, 0 # p1 + d1 narrowed with unsigned saturation
vnclipu.wi v10, v28, 0 # p2 - d1 narrowed with unsigned saturation
vncvt.x.x.w v11, v30 # low 8 bits of p3 + d2
jr t0 # return to the caller through the alternate link register
endfunc
.option pop
/*
 * ff_h263_v_loop_filter_rvv
 *
 * Loads 8 bytes from each of four rows, at a0 - 2 * a1, a0 - a1, a0 and
 * a0 + a1, runs them through the filter core at local label 1 of
 * ff_h263_h_loop_filter_rvv, and stores the four rows back in place.
 *
 * In:   a0 = address of the third of the four rows (presumably the src
 *            pointer of the C prototype, TODO confirm against the caller)
 *       a1 = byte distance between consecutive rows
 *       a2 = not read here; the filter core uses it as a byte index into
 *            ff_h263_loop_filter_strength
 * Clobbers: a3, a4, a5, t0, plus everything the filter core writes
 *           (t1, t2, v0, v8-v12, v14, v16, v18, v20, v24, v26, v28, v30,
 *           vxrm, vl, vtype)
 *
 * NOTE(review): vl = 8 at e8/mf2 needs VLEN >= 128 and nothing in this
 * file checks it; presumably the caller guarantees it, to be confirmed.
 */
func ff_h263_v_loop_filter_rvv, zve32x
sub a4, a0, a1 # a4 = row at a0 - a1
vsetivli zero, 8, e8, mf2, ta, ma # vl = 8 elements of 8 bits
vle8.v v10, (a0) # p2 = row at a0
sub a3, a4, a1 # a3 = row at a0 - 2 * a1
vle8.v v9, (a4) # p1
add a5, a0, a1 # a5 = row at a0 + a1
vle8.v v8, (a3) # p0
vle8.v v11, (a5) # p3
jal t0, 1b # call the filter core in the previous function, linking through t0 so that ra is untouched
vse8.v v8, (a3) # store filtered p0
vse8.v v9, (a4) # store filtered p1
vse8.v v10, (a0) # store filtered p2
vse8.v v11, (a5) # store filtered p3
ret
endfunc