avcodec/riscv: add h264 dc idct rvv

checkasm: bench runs 131072 (1 << 17)
h264_idct4_add_dc_8bpp_c: 1.5
h264_idct4_add_dc_8bpp_rvv_i64: 0.7
h264_idct4_add_dc_9bpp_c: 1.5
h264_idct4_add_dc_9bpp_rvv_i64: 0.7
h264_idct4_add_dc_10bpp_c: 1.5
h264_idct4_add_dc_10bpp_rvv_i64: 0.7
h264_idct4_add_dc_12bpp_c: 1.2
h264_idct4_add_dc_12bpp_rvv_i64: 0.7
h264_idct4_add_dc_14bpp_c: 1.2
h264_idct4_add_dc_14bpp_rvv_i64: 0.7
h264_idct8_add_dc_8bpp_c: 5.2
h264_idct8_add_dc_8bpp_rvv_i64: 1.5
h264_idct8_add_dc_9bpp_c: 5.5
h264_idct8_add_dc_9bpp_rvv_i64: 1.2
h264_idct8_add_dc_10bpp_c: 5.5
h264_idct8_add_dc_10bpp_rvv_i64: 1.2
h264_idct8_add_dc_12bpp_c: 4.2
h264_idct8_add_dc_12bpp_rvv_i64: 1.2
h264_idct8_add_dc_14bpp_c: 4.2
h264_idct8_add_dc_14bpp_rvv_i64: 1.2

Signed-off-by: J. Dekker <jdek@itanimul.li>
This commit is contained in:
J. Dekker 2024-06-10 12:43:44 +02:00
parent b3aeef3bf9
commit fa5a605542
2 changed files with 145 additions and 14 deletions

View File

@ -1,4 +1,5 @@
/*
* Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* This file is part of FFmpeg.
@ -68,6 +69,16 @@ void ff_h264_add_pixels4_16_rvv(uint8_t *dst, int16_t *block, int stride);
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
/* DC-only IDCT-and-add routines, one 4x4 and one 8x8 variant for each of the
 * bit depths 8, 9, 10, 12 and 14. The init function below installs them as
 * the h264_idct_dc_add and h264_idct8_dc_add members of H264DSPContext.
 * NOTE(review): parameters are unnamed here; presumably (dst, block, stride)
 * like ff_h264_add_pixels4_16_rvv above - confirm against the C reference. */
void ff_h264_idct4_dc_add_8_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_8_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_9_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_9_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_10_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_10_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_12_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_12_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_14_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_14_rvv(uint8_t *, int16_t *, int);
av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
const int chroma_format_idc)
@ -94,36 +105,51 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_h_loop_filter_luma_mbaff =
ff_h264_h_loop_filter_luma_mbaff_8_rvv;
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv;
# if __riscv_xlen == 64
dsp->h264_idct_add16 = ff_h264_idct_add16_8_rvv;
dsp->h264_idct_add16 = ff_h264_idct_add16_8_rvv;
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
# endif
if (flags & AV_CPU_FLAG_RVV_I64)
if (flags & AV_CPU_FLAG_RVV_I32)
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
}
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
}
if (bit_depth == 9) {
if (zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
if (zvl128b) {
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
}
dsp->h264_idct8_add = ff_h264_idct8_add_9_rvv;
}
if (bit_depth == 10) {
if (zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
if (zvl128b) {
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
}
dsp->h264_idct8_add = ff_h264_idct8_add_10_rvv;
}
if (bit_depth == 12) {
if (zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv;
if (zvl128b) {
dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
}
dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv;
}
if (bit_depth == 14) {
if (zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
if (zvl128b) {
dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
}
dsp->h264_idct8_add = ff_h264_idct8_add_14_rvv;
}
if (bit_depth > 8 && zvl128b) {

View File

@ -1,4 +1,7 @@
/*
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2024 J. Dekker <jdek@itanimul.li>
* Copyright © 2024 Rémi Denis-Courmont.
*
* Redistribution and use in source and binary forms, with or without
@ -323,3 +326,105 @@ func ff_h264_h_loop_filter_luma_mbaff_8_rvv, zve32x
vssseg6e8.v v8, (a0), a1
ret
endfunc
/*
 * DC-only IDCT add for 8-bit pixels.
 *
 * Register use:
 *   a0  pixel pointer, rows are a2 bytes apart (read, then written back)
 *   a1  coefficient pointer; the 16-bit value at offset 0 is read and zeroed
 *   a2  row stride in bytes
 *   a3  scratch: dc = (coefficient + 32) >> 6
 *   t0  scratch: receives VLMAX from vsetvli, not used afterwards
 *   v0  group holding the pixels widened to 16 bits
 *   v24 group holding the packed 8-bit pixels
 *
 * Helper parameters:
 *   width        rows (and pixels per row) of the block
 *   eew          bit width of one row, used as element size of the strided
 *                load/store so that one element carries one whole row
 *   lmul_ld      LMUL paired with e16 for the initial vl = width setup
 *   lmul_wide    LMUL of the 16-bit working group
 *   lmul_narrow  LMUL of the 8-bit result group
 *
 * NOTE(review): the initial vsetivli only yields vl = width when VLMAX is at
 * least width for e16 with lmul_ld - presumably the caller guarantees
 * VLEN >= 128; confirm against the init code.
 * NOTE(review): no Zba instruction appears in this body although the
 * function is declared with zba.
 */
.macro  idct_dc_add8_impl width, eew, lmul_ld, lmul_wide, lmul_narrow
func ff_h264_idct\width\()_dc_add_8_rvv, zve64x, zba
        vsetivli        zero, \width, e16, \lmul_ld, ta, ma
        /* dc = (block[0] + 32) >> 6, then clear the coefficient */
        lh              a3, 0(a1)
        addi            a3, a3, 32
        srai            a3, a3, 6
        sh              zero, 0(a1)
        /* gather all rows: one wide element per row */
        vlse\eew\().v   v24, (a0), a2
        /* operate on the complete register group (vl = VLMAX) */
        vsetvli         t0, zero, e16, \lmul_wide, ta, ma
        vzext.vf2       v0, v24
        vadd.vx         v0, v0, a3
        /* clamp below at 0; the narrowing clip saturates above at 255 */
        vmax.vx         v0, v0, zero
        vsetvli         zero, zero, e8, \lmul_narrow, ta, ma
        vnclipu.wi      v24, v0, 0
        /* scatter the rows back, again one wide element per row */
        vsetivli        zero, \width, e8, m1, ta, ma
        vsse\eew\().v   v24, (a0), a2
        ret
endfunc
.endm

/* Select the per-width vector configuration; any width other than 8 takes
 * the 32-bit-row settings, as before. */
.macro  idct_dc_add8 width
.if \width == 8
        idct_dc_add8_impl \width, 64, m1, m8, m4
.else
        idct_dc_add8_impl \width, 32, mf2, m4, m2
.endif
.endm

idct_dc_add8 4
idct_dc_add8 8
/*
 * Apply one unit-stride vector memory instruction (insn) to every row of the
 * block: row i lives in register v<i>. Row addresses are expected in
 * a0, t4, t5, t6 (rows 0-3) and, for width 8, t0, t1, t2, t3 (rows 4-7).
 */
.macro  idct_dc_rows16 insn, width
        \insn           v0, (a0)
        \insn           v1, (t4)
        \insn           v2, (t5)
        \insn           v3, (t6)
.if \width == 8
        \insn           v4, (t0)
        \insn           v5, (t1)
        \insn           v6, (t2)
        \insn           v7, (t3)
.endif
.endm

/*
 * DC-only IDCT add for pixels stored as 16-bit values.
 *
 * Register use:
 *   a0  pixel pointer, rows are a2 bytes apart (read, then written back)
 *   a1  coefficient pointer; the 32-bit value at offset 0 is read and zeroed
 *   a2  row stride in bytes
 *   a3  scratch: dc = (coefficient + 32) >> 6
 *   a5  upper clamp bound, must be set by the caller before jumping here
 *       (the per-depth entry points load (1 << depth) - 1)
 *   a6  scratch: receives VLMAX from vsetvli, not used afterwards
 *   t0-t6 scratch: row addresses
 *
 * NOTE(review): the sum is formed on 16-bit elements, so only the low 16 bits
 * of a3 take part and the addition wraps before clamping - presumably the dc
 * range makes that unreachable; confirm.
 * NOTE(review): one row per m1 register needs VLMAX >= width at e16, i.e.
 * VLEN >= 128 for width 8 - confirm the caller guarantees it.
 */
.macro  idct_dc_add width
func ff_h264_idct\width\()_dc_add_16_rvv, zve64x, zba
        vsetivli        zero, \width, e16, m1, ta, ma
        /* dc = (block[0] + 32) >> 6, then clear the coefficient */
        lw              a3, 0(a1)
        addi            a3, a3, 32
        srai            a3, a3, 6
        sw              zero, 0(a1)
        /* row addresses: t4 = row 1, t5 = row 2, t6 = row 3 */
        add             t4, a0, a2
        sh1add          t5, a2, a0
        sh1add          t6, a2, t4
.if \width == 8
        /* rows 4-7 are rows 0-3 advanced by four strides */
        sh2add          t0, a2, a0
        sh2add          t1, a2, t4
        sh2add          t2, a2, t5
        sh2add          t3, a2, t6
.endif
        idct_dc_rows16  vle16.v, \width
        /* treat the row registers as one group and process it whole */
.if \width == 8
        vsetvli         a6, zero, e16, m8, ta, ma
.else
        vsetvli         a6, zero, e16, m4, ta, ma
.endif
        vadd.vx         v0, v0, a3
        /* clamp to [0, a5] */
        vmax.vx         v0, v0, zero
        vmin.vx         v0, v0, a5
        /* back to one row per register for the stores */
        vsetivli        zero, \width, e16, m1, ta, ma
        idct_dc_rows16  vse16.v, \width
        ret
endfunc
.endm

idct_dc_add 4
idct_dc_add 8
/*
 * Per-bit-depth entry point for the 16-bit-pixel DC add: loads the largest
 * representable pixel value, (1 << depth) - 1, into a5 and tail-jumps to the
 * shared implementation for the given block size.
 */
.macro  idct_dc_add_depth size, depth
func ff_h264_idct\size\()_dc_add_\depth\()_rvv, zve64x
        li              a5, (1 << \depth) - 1
        j               ff_h264_idct\size\()_dc_add_16_rvv
endfunc
.endm

/* Emit the 4x4 and 8x8 entry points for every supported high bit depth. */
.irp    depth, 9, 10, 12, 14
        idct_dc_add_depth 4, \depth
        idct_dc_add_depth 8, \depth
.endr