/*
 * Copyright © 2024 Rémi Denis-Courmont.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/riscv/asm.S"

        .option push
        .option norelax

func ff_h263_h_loop_filter_rvv, zve32x
        addi    a0, a0, -2            # point at p0, two pixels left of the edge
        vsetivli zero, 8, e8, mf2, ta, ma
        vlsseg4e8.v v8, (a0), a1      # load p0..p3 across the vertical edge, 8 rows
        jal     t0, 1f                # call the shared filter core below
        vssseg4e8.v v8, (a0), a1      # store the filtered p0..p3 back
        ret

        # Shared filter core: v8..v11 hold p0..p3 (8 pixels each),
        # a2 indexes ff_h263_loop_filter_strength, t1 and t2 are clobbered,
        # the return address is taken in t0.
1:
        csrwi   vxrm, 0               # round-to-nearest-up for vnclipu
2:
        auipc   t1, %pcrel_hi(ff_h263_loop_filter_strength)
        vwsubu.vv v14, v10, v9        # p2 - p1
        add     t1, t1, a2
        vwsubu.vv v12, v8, v11        # p0 - p3
        vsetvli zero, zero, e16, m1, ta, mu
        vsll.vi v14, v14, 2
        lbu     t1, %pcrel_lo(2b)(t1) # strength
        vadd.vv v16, v12, v14
        # Divide by 8 toward 0. v16 is a signed 10-bit value at this point.
        vsrl.vi v18, v16, 16 - 3      # v18 = (v16 < 0) ? 7 : 0
        slli    t2, t1, 1             # 2 * strength
        vadd.vv v16, v16, v18
        # v16 (d) is signed 7-bit, but later arithmetic requires 9 bits.
        vsra.vi v16, v16, 3           # d
        vmv.v.x v20, t2
        vmslt.vi v0, v16, 0
        vneg.v  v18, v16
        vneg.v  v20, v20, v0.t        # sign(d) * 2 * strength
        vmax.vv v18, v16, v18         # |d|
        vsub.vv v20, v20, v16         # d1 if strength <= |d| <= 2 * strength
        vmsge.vx v0, v18, t2
        vsrl.vi v14, v12, 16 - 2      # v14 = (v12 < 0) ? 3 : 0
        vmerge.vxm v20, v20, zero, v0 # d1 if strength <= |d|
        vadd.vv v12, v12, v14
        vmsge.vx v0, v18, t1
        vsra.vi v12, v12, 2           # (p0 - p3) / 4
        vmerge.vvm v16, v16, v20, v0  # d1
        vzext.vf2 v24, v8             # p0 as u16 (because vwrsubu.wv does not exist)
        vneg.v  v14, v16
        vzext.vf2 v26, v9             # p1 as u16
        vmax.vv v14, v16, v14         # |d1|
        vzext.vf2 v28, v10            # p2 as u16
        vsra.vi v14, v14, 1           # ad1
        vadd.vv v26, v26, v16         # p1 + d1
        vneg.v  v18, v14              # -ad1
        vmin.vv v12, v12, v14
        vsub.vv v28, v28, v16         # p2 - d1
        vmax.vv v12, v12, v18         # d2
        vmax.vx v26, v26, zero
        vsub.vv v24, v24, v12         # p0 - d2
        vmax.vx v28, v28, zero
        vsetvli zero, zero, e8, mf2, ta, ma
        vwaddu.wv v30, v12, v11       # p3 + d2
        vncvt.x.x.w v8, v24
        vnclipu.wi v9, v26, 0
        vnclipu.wi v10, v28, 0
        vncvt.x.x.w v11, v30
        jr      t0
endfunc

        .option pop

func ff_h263_v_loop_filter_rvv, zve32x
        sub     a4, a0, a1            # a4 = p1 row (src - stride)
        vsetivli zero, 8, e8, mf2, ta, ma
        vle8.v  v10, (a0)             # p2 row
        sub     a3, a4, a1            # a3 = p0 row (src - 2 * stride)
        vle8.v  v9, (a4)              # p1 row
        add     a5, a0, a1            # a5 = p3 row (src + stride)
        vle8.v  v8, (a3)              # p0 row
        vle8.v  v11, (a5)             # p3 row
        jal     t0, 1b                # shared filter core in ff_h263_h_loop_filter_rvv
        vse8.v  v8, (a3)
        vse8.v  v9, (a4)
        vse8.v  v10, (a0)
        vse8.v  v11, (a5)
        ret
endfunc
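
/*
 * For reference, a rough scalar sketch of what the shared filter core above
 * computes for each of the 8 pixels across the edge. It is reconstructed from
 * the register comments (p0..p3, d, d1, ad1, d2) rather than copied from the
 * C reference, and it assumes a2 carries qscale; clip() denotes clamping to
 * the given range.
 *
 *     int strength = ff_h263_loop_filter_strength[qscale];
 *     int d = (p0 - p3 + 4 * (p2 - p1)) / 8;   // truncated toward zero
 *     int d1;
 *     if (abs(d) < strength)
 *         d1 = d;
 *     else if (abs(d) < 2 * strength)
 *         d1 = (d > 0 ? 2 * strength : -2 * strength) - d;
 *     else
 *         d1 = 0;
 *     int ad1 = abs(d1) >> 1;
 *     int d2  = clip((p0 - p3) / 4, -ad1, ad1);
 *     p1 = clip(p1 + d1, 0, 255);              // vnclipu saturates the top
 *     p2 = clip(p2 - d1, 0, 255);
 *     p0 -= d2;                                // narrowed without saturation
 *     p3 += d2;
 */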