lavc/vp7dsp: add R-V V vp7_luma_dc_wht

This works out a bit more favourably than VP8's due to:
- additional multiplications that can be vectorised,
- the hardware-supported fixed-point rounding mode.

vp7_luma_dc_wht_c:       3.2
vp7_luma_dc_wht_rvv_i64: 2.0
Author: Rémi Denis-Courmont
Date:   2024-05-26 12:26:01 +03:00
Parent: 91b5ea7bb9
Commit: fd39997f72

5 changed files with 144 additions and 0 deletions
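Before the per-file diffs, a scalar sketch of what the new function computes may help. The sketch below is reconstructed from the constants and data flow in vp7dsp_rvv.S (it is not a verbatim copy of FFmpeg's vp7_luma_dc_wht_c, and the clip_int16 helper is purely illustrative). Despite the "WHT" name inherited from the shared VP8 DSP slot, VP7's second-level DC transform is a DCT-style butterfly: 23170, 30274 and 12540 are √2, 2·cos(π/8) and 2·sin(π/8) in Q14, which is where the additional, vectorisable multiplications mentioned above come from.

#include <stdint.h>

static int clip_int16(int v)
{
    return v < -32768 ? -32768 : v > 32767 ? 32767 : v;
}

static void vp7_luma_dc_wht_sketch(int16_t block[4][4][16], int16_t dc[16])
{
    int16_t tmp[16];

    for (int i = 0; i < 4; i++) {          /* horizontal pass over row i */
        int a = (dc[i * 4 + 0] + dc[i * 4 + 2]) * 23170;
        int b = (dc[i * 4 + 0] - dc[i * 4 + 2]) * 23170;
        int c =  dc[i * 4 + 1] * 12540 - dc[i * 4 + 3] * 30274;
        int d =  dc[i * 4 + 1] * 30274 + dc[i * 4 + 3] * 12540;

        tmp[i * 4 + 0] = (a + d) >> 14;    /* truncating shift, as vnsra.wi */
        tmp[i * 4 + 1] = (b + c) >> 14;
        tmp[i * 4 + 2] = (b - c) >> 14;
        tmp[i * 4 + 3] = (a - d) >> 14;
    }

    for (int j = 0; j < 4; j++) {          /* vertical pass over column j */
        int a = (tmp[0 * 4 + j] + tmp[2 * 4 + j]) * 23170;
        int b = (tmp[0 * 4 + j] - tmp[2 * 4 + j]) * 23170;
        int c =  tmp[1 * 4 + j] * 12540 - tmp[3 * 4 + j] * 30274;
        int d =  tmp[1 * 4 + j] * 30274 + tmp[3 * 4 + j] * 12540;

        /* rounded, saturating shift, as vnclip.wi with vxrm = RNU */
        block[0][j][0] = clip_int16((a + d + 0x20000) >> 18);
        block[1][j][0] = clip_int16((b + c + 0x20000) >> 18);
        block[2][j][0] = clip_int16((b - c + 0x20000) >> 18);
        block[3][j][0] = clip_int16((a - d + 0x20000) >> 18);
    }

    for (int i = 0; i < 16; i++)           /* the assembly clears dc[] with sd zero */
        dc[i] = 0;
}

The assembly runs each pass on all four rows (respectively columns) at once, transposing between the two passes through a small stack buffer, and the rounded, saturating >> 18 of the second pass maps onto a single vnclip.wi per output register thanks to the vxrm rounding mode.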

File: libavcodec/riscv/Makefile

@@ -65,6 +65,8 @@ RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
OBJS-$(CONFIG_VP7_DECODER) += riscv/vp7dsp_init.o
RVV-OBJS-$(CONFIG_VP7_DECODER) += riscv/vp7dsp_rvv.o
OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o

File: libavcodec/riscv/vp7dsp_init.c (new file)

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/riscv/cpu.h"
#include "libavcodec/vp8dsp.h"
void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
{
#if HAVE_RVV
int flags = av_get_cpu_flags();
if ((flags & AV_CPU_FLAG_RVV_I32) && ff_rv_vlen_least(128)) {
#if __riscv_xlen >= 64
c->vp8_luma_dc_wht = ff_vp7_luma_dc_wht_rvv;
#endif
}
#endif
}

File: libavcodec/riscv/vp7dsp_rvv.S (new file)

@@ -0,0 +1,95 @@
/*
* Copyright (c) 2024 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/riscv/asm.S"
#if __riscv_xlen >= 64
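// void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16])
//   a0 = block (receives one DC coefficient per 4x4 sub-block)
//   a1 = dc    (input 4x4 DC matrix, cleared on return)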
func ff_vp7_luma_dc_wht_rvv, zve32x
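// vxrm = 0 selects round-to-nearest-up, used by the vnclip.wi narrowings below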
csrwi vxrm, 0
li t4, 12540
vsetivli zero, 4, e16, mf2, ta, ma
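// segmented load of dc[16]: v0..v3 receive columns 0..3, one row per element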
vlseg4e16.v v0, (a1)
li t6, 30274
vwmul.vx v8, v1, t4
li t5, 23170
vwmul.vx v9, v3, t6
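// the interleaved addi set up pointers to rows 1..3 of a 32-byte stack
// scratch buffer (allocated just below) used to transpose between the passes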
addi t1, sp, -12 * 2
vwmul.vx v10, v1, t6
addi t2, sp, -8 * 2
vwmul.vx v11, v3, t4
addi t3, sp, -4 * 2
vwadd.vv v4, v0, v2
addi sp, sp, -16 * 2
vwsub.vv v5, v0, v2
vsetvli zero, zero, e32, m1, ta, ma
vadd.vv v7, v10, v11
vmul.vx v4, v4, t5
vsub.vv v6, v8, v9
vmul.vx v5, v5, t5
vadd.vv v0, v4, v7
vsub.vv v3, v4, v7
vadd.vv v1, v5, v6
vsub.vv v2, v5, v6
vsetvli zero, zero, e16, mf2, ta, ma
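// end of the row pass: narrow to 16 bits (plain >> 14), store the results in
// row-major order and reload whole rows, so the second pass runs down columns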
vnsra.wi v4, v0, 14
vnsra.wi v7, v3, 14
vnsra.wi v5, v1, 14
vnsra.wi v6, v2, 14
vsseg4e16.v v4, (sp)
vle16.v v0, (sp)
vle16.v v1, (t1)
vle16.v v2, (t2)
vle16.v v3, (t3)
vwmul.vx v8, v1, t4
li t0, 16 * 2
vwmul.vx v9, v3, t6
addi t1, a0, 1 * 4 * 16 * 2
vwmul.vx v10, v1, t6
addi t2, a0, 2 * 4 * 16 * 2
vwmul.vx v11, v3, t4
addi t3, a0, 3 * 4 * 16 * 2
vwadd.vv v4, v0, v2
vwsub.vv v5, v0, v2
vsetvli zero, zero, e32, m1, ta, ma
vmul.vx v4, v4, t5
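// clear dc[0..15] with four 64-bit stores (hence the __riscv_xlen >= 64 guard)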
sd zero, (a1)
vadd.vv v7, v10, v11
sd zero, 8(a1)
vmul.vx v5, v5, t5
sd zero, 16(a1)
vsub.vv v6, v8, v9
sd zero, 24(a1)
vadd.vv v0, v4, v7
addi sp, sp, 16 * 2
vsub.vv v3, v4, v7
vadd.vv v1, v5, v6
vsub.vv v2, v5, v6
vsetvli zero, zero, e16, mf2, ta, ma
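// narrow with rounding and signed saturation: (x + 0x20000) >> 18, clipped to int16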
vnclip.wi v4, v0, 18
vnclip.wi v5, v1, 18
vnclip.wi v6, v2, 18
vnclip.wi v7, v3, 18
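// store one DC coefficient per sub-block: stride t0 = 32 bytes = one 16-coefficient block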
vsse16.v v4, (a0), t0
vsse16.v v5, (t1), t0
vsse16.v v6, (t2), t0
vsse16.v v7, (t3), t0
ret
endfunc
#endif

File: libavcodec/vp8dsp.c

@@ -712,6 +712,10 @@ av_cold void ff_vp7dsp_init(VP8DSPContext *dsp)
    dsp->vp8_v_loop_filter_simple = vp7_v_loop_filter_simple_c;
    dsp->vp8_h_loop_filter_simple = vp7_h_loop_filter_simple_c;

#if ARCH_RISCV
    ff_vp7dsp_init_riscv(dsp);
#endif
}
#endif /* CONFIG_VP7_DECODER */

File: libavcodec/vp8dsp.h

@@ -90,6 +90,8 @@ void ff_vp78dsp_init_ppc(VP8DSPContext *c);
void ff_vp78dsp_init_riscv(VP8DSPContext *c);
void ff_vp78dsp_init_x86(VP8DSPContext *c);
void ff_vp7dsp_init_riscv(VP8DSPContext *c);
void ff_vp8dsp_init(VP8DSPContext *c);
void ff_vp8dsp_init_aarch64(VP8DSPContext *c);
void ff_vp8dsp_init_arm(VP8DSPContext *c);