lavc/h264dsp: R-V V high-depth h264_idct_add

T-Head C908 (cycles):
h264_idct4_add_9bpp_c:        248.2
h264_idct4_add_9bpp_rvv_i32:  128.7
h264_idct4_add_10bpp_c:       256.7
h264_idct4_add_10bpp_rvv_i32: 128.7
h264_idct4_add_12bpp_c:       252.5
h264_idct4_add_12bpp_rvv_i32: 129.7
h264_idct4_add_14bpp_c:       258.0
h264_idct4_add_14bpp_rvv_i32: 129.7
This commit is contained in:
Rémi Denis-Courmont 2024-07-02 22:03:07 +03:00
parent d059ea5663
commit 4e0e872881
2 changed files with 81 additions and 1 deletions

View File

@ -52,6 +52,11 @@ void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset,
int16_t *block, int stride,
const uint8_t nnzc[5 * 8]);
/*
 * 4x4 inverse transform + add, one entry point per supported high bit depth
 * (9, 10, 12 and 14 bits per sample).
 *
 * The assembly implementations read block as 32-bit coefficients and dst as
 * 16-bit samples, and clear the 64-byte block before returning.
 * NOTE(review): the uint8_t * and int16_t * parameter types presumably mirror
 * the h264_idct_add function pointer these are assigned to - confirm.
 */
void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
@ -65,7 +70,9 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
# if HAVE_RVV
if (flags & AV_CPU_FLAG_RVV_I32) {
if (bit_depth == 8 && ff_rv_vlen_least(128)) {
const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8 && zvl128b) {
for (int i = 0; i < 4; i++) {
dsp->weight_h264_pixels_tab[i] =
ff_h264_weight_funcs_8_rvv[i].weight;
@ -86,6 +93,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
# endif
}
if (bit_depth == 9 && zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
if (bit_depth == 10 && zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
if (bit_depth == 12 && zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
if (bit_depth == 14 && zvl128b)
dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
}
# endif

View File

@ -105,6 +105,69 @@ func ff_h264_idct_add_8_rvv, zve32x
ret
endfunc
/*
 * Shared 4x4 inverse transform + add for sample depths above 8 bits.
 *
 * Registers on entry, as used below:
 *   a0 - destination samples, accessed as 16-bit elements
 *   a1 - 4x4 block of 32-bit coefficients (64 bytes); cleared before return
 *   a2 - byte offset from one destination row to the next
 *   a3 - upper clamp bound; the per-depth entry points load
 *        (1 << depth) - 1 here before jumping to this routine
 *
 * Clobbers t0-t3, v0-v7, vl/vtype and vxrm, plus whatever
 * ff_h264_idct4_rvv itself modifies.
 */
func ff_h264_idct_add_16_rvv, zve32x
/* vxrm = 0 selects round-to-nearest-up for the vssra below. */
csrwi vxrm, 0
/* Work on 4 elements of 32 bits per vector register. */
vsetivli zero, 4, e32, m1, ta, ma
/* v0..v3 = the four consecutive 16-byte groups of the coefficient block;
 * address computations are interleaved with the loads. */
addi t1, a1, 1 * 4 * 4
vle32.v v0, (a1)
addi t2, a1, 2 * 4 * 4
vle32.v v1, (t1)
addi t3, a1, 3 * 4 * 4
vle32.v v2, (t2)
vle32.v v3, (t3)
/* First pass. The helper is defined elsewhere and is entered with t0 as the
 * link register; presumably it transforms v0-v3 in place - TODO confirm. */
jal t0, ff_h264_idct4_rvv
/* Transpose through memory: store the four vectors back, then reload with a
 * 4-field segment load so that field k of every 16-byte group lands in vk. */
vse32.v v0, (a1)
vse32.v v1, (t1)
vse32.v v2, (t2)
vse32.v v3, (t3)
vlseg4e32.v v0, (a1)
/* Clear the whole block: 512 / XLEN stores of XLEN / 8 bytes each = 64 bytes.
 * sx is a macro defined elsewhere, presumably an XLEN-wide store. */
.equ offset, 0
.rept 512 / __riscv_xlen
sx zero, offset(a1)
.equ offset, offset + (__riscv_xlen / 8)
.endr
/* Second pass on the transposed data. */
jal t0, ff_h264_idct4_rvv
/* t1..t3 = addresses of destination rows 1..3; v4..v7 = 4 samples of 16 bits
 * from each of the four rows (EEW=16 loads, VL is still 4). */
add t1, a0, a2
vle16.v v4, (a0)
add t2, t1, a2
vle16.v v5, (t1)
add t3, t2, a2
vle16.v v6, (t2)
vle16.v v7, (t3)
/* Scale the residual: arithmetic right shift by 6, rounded per vxrm. */
.irp n,0,1,2,3
vssra.vi v\n, v\n, 6
.endr
/* SEW=16 so the widening add yields 32-bit sums:
 * v0..v3 (32-bit) += zero-extended 16-bit destination samples. */
vsetvli zero, zero, e16, mf2, ta, ma
vwaddu.wv v0, v0, v4
vwaddu.wv v1, v1, v5
vwaddu.wv v2, v2, v6
vwaddu.wv v3, v3, v7
/* Back to 32-bit elements to clamp the sums to [0, a3] (signed min/max). */
vsetvli zero, zero, e32, m1, ta, ma
.irp n,0,1,2,3
vmax.vx v\n, v\n, zero
.endr
.irp n,0,1,2,3
vmin.vx v\n, v\n, a3
.endr
/* Narrow the clamped sums to 16 bits and write the four rows back. */
vsetvli zero, zero, e16, mf2, ta, ma
vncvt.x.x.w v4, v0
vncvt.x.x.w v5, v1
vncvt.x.x.w v6, v2
vncvt.x.x.w v7, v3
vse16.v v4, (a0)
vse16.v v5, (t1)
vse16.v v6, (t2)
vse16.v v7, (t3)
ret
endfunc
/*
 * Emit the entry point for one sample depth: load the largest sample value
 * representable at that depth into a3, then tail-jump to the shared
 * high-depth routine, which uses a3 as its upper clamp bound.
 */
.macro h264_idct_add_hbd_entry depth
func ff_h264_idct_add_\depth\()_rvv, zve32x
        li      a3, (1 << \depth) - 1
        j       ff_h264_idct_add_16_rvv
endfunc
.endm

h264_idct_add_hbd_entry 9
h264_idct_add_hbd_entry 10
h264_idct_add_hbd_entry 12
h264_idct_add_hbd_entry 14
.variant_cc ff_h264_idct8_rvv
func ff_h264_idct8_rvv, zve32x
vsra.vi v9, v7, 1