lavc/h264dsp: R-V V high-depth idct_add16{,intra}, idct8_add4

As with 8-bit, this tends to be faster, but results are all over the
place due to the variable distribution of non-zero coefficients.
This commit is contained in:
Rémi Denis-Courmont 2024-07-15 21:19:39 +03:00
parent 90672974bd
commit 483fd732ab
2 changed files with 160 additions and 91 deletions

View File

@ -41,26 +41,27 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride, void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0); int alpha, int beta, int8_t *tc0);
void ff_h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride); #define IDCT_DEPTH(depth) \
void ff_h264_idct8_add_8_rvv(uint8_t *dst, int16_t *block, int stride); void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
void ff_h264_idct_add16_8_rvv(uint8_t *dst, const int *blockoffset, void ff_h264_idct8_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
int16_t *block, int stride, void ff_h264_idct4_dc_add_##depth##_rvv(uint8_t *, int16_t *, int); \
const uint8_t nnzc[5 * 8]); void ff_h264_idct8_dc_add_##depth##_rvv(uint8_t *, int16_t *, int); \
void ff_h264_idct_add16intra_8_rvv(uint8_t *dst, const int *blockoffset, void ff_h264_idct_add16_##depth##_rvv(uint8_t *d, const int *soffset, \
int16_t *block, int stride, int16_t *s, int stride, \
const uint8_t nnzc[5 * 8]); const uint8_t nnzc[5 * 8]); \
void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset, void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \
int16_t *block, int stride, int16_t *s, int stride, \
const uint8_t nnzc[5 * 8]); const uint8_t nnzc[5 * 8]); \
void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \
int16_t *s, int stride, \
const uint8_t nnzc[5 * 8]);
void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride); IDCT_DEPTH(8)
void ff_h264_idct8_add_9_rvv(uint8_t *dst, int16_t *block, int stride); IDCT_DEPTH(9)
void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride); IDCT_DEPTH(10)
void ff_h264_idct8_add_10_rvv(uint8_t *dst, int16_t *block, int stride); IDCT_DEPTH(12)
void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride); IDCT_DEPTH(14)
void ff_h264_idct8_add_12_rvv(uint8_t *dst, int16_t *block, int stride); #undef IDCT_DEPTH
void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride); void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride); void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride);
@ -69,16 +70,6 @@ void ff_h264_add_pixels4_16_rvv(uint8_t *dst, int16_t *block, int stride);
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int); extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int); extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
void ff_h264_idct4_dc_add_8_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_8_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_9_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_9_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_10_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_10_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_12_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_12_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct4_dc_add_14_rvv(uint8_t *, int16_t *, int);
void ff_h264_idct8_dc_add_14_rvv(uint8_t *, int16_t *, int);
av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
const int chroma_format_idc) const int chroma_format_idc)
@ -120,38 +111,31 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
} }
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv; dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
} }
if (bit_depth == 9) {
if (zvl128b) { #define IDCT_DEPTH(depth) \
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv; if (bit_depth == depth) { \
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv; if (zvl128b) \
dsp->h264_idct_add = ff_h264_idct_add_9_rvv; dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
} if (flags & AV_CPU_FLAG_RVB_ADDR) \
dsp->h264_idct8_add = ff_h264_idct8_add_9_rvv; dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
} if (zvl128b && (flags & AV_CPU_FLAG_RVB_ADDR)) { \
if (bit_depth == 10) { dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \
if (zvl128b) { dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv; } \
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv; if (__riscv_xlen == 64 && zvl128b) { \
dsp->h264_idct_add = ff_h264_idct_add_10_rvv; dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
} dsp->h264_idct_add16intra = \
dsp->h264_idct8_add = ff_h264_idct8_add_10_rvv; ff_h264_idct_add16intra_##depth##_rvv; \
} } \
if (bit_depth == 12) { if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \
if (zvl128b) { dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \
dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
}
dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv;
}
if (bit_depth == 14) {
if (zvl128b) {
dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
}
dsp->h264_idct8_add = ff_h264_idct8_add_14_rvv;
} }
IDCT_DEPTH(9)
IDCT_DEPTH(10)
IDCT_DEPTH(12)
IDCT_DEPTH(14)
if (bit_depth > 8 && zvl128b) { if (bit_depth > 8 && zvl128b) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv; dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv;
if (flags & AV_CPU_FLAG_RVV_I64) if (flags & AV_CPU_FLAG_RVV_I64)

View File

@ -107,6 +107,7 @@ endfunc
func ff_h264_idct_add_16_rvv, zve32x func ff_h264_idct_add_16_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
.Lidct_add4_16_rvv:
vsetivli zero, 4, e32, m1, ta, ma vsetivli zero, 4, e32, m1, ta, ma
addi t1, a1, 1 * 4 * 4 addi t1, a1, 1 * 4 * 4
vle32.v v0, (a1) vle32.v v0, (a1)
@ -147,7 +148,7 @@ func ff_h264_idct_add_16_rvv, zve32x
vmax.vx v\n, v\n, zero vmax.vx v\n, v\n, zero
.endr .endr
.irp n,0,1,2,3 .irp n,0,1,2,3
vmin.vx v\n, v\n, a3 vmin.vx v\n, v\n, a5
.endr .endr
vsetvli zero, zero, e16, mf2, ta, ma vsetvli zero, zero, e16, mf2, ta, ma
vncvt.x.x.w v4, v0 vncvt.x.x.w v4, v0
@ -295,9 +296,10 @@ func ff_h264_idct8_add_8_rvv, zve32x
endfunc endfunc
func ff_h264_idct8_add_16_rvv, zve32x func ff_h264_idct8_add_16_rvv, zve32x
li a4, 8
csrwi vxrm, 0 csrwi vxrm, 0
vsetivli a5, 8, e32, m1, ta, ma .Lidct8_add_16_rvv:
li a4, 8
vsetivli a3, 8, e32, m1, ta, ma
1: 1:
addi t1, a1, 1 * 8 * 4 addi t1, a1, 1 * 8 * 4
vle32.v v0, (a1) vle32.v v0, (a1)
@ -313,11 +315,11 @@ func ff_h264_idct8_add_16_rvv, zve32x
vle32.v v5, (t5) vle32.v v5, (t5)
addi a7, a1, 7 * 8 * 4 addi a7, a1, 7 * 8 * 4
vle32.v v6, (t6) vle32.v v6, (t6)
sub a4, a4, a5 sub a4, a4, a3
vle32.v v7, (a7) vle32.v v7, (a7)
jal t0, ff_h264_idct8_rvv jal t0, ff_h264_idct8_rvv
vse32.v v0, (a1) vse32.v v0, (a1)
sh2add a1, a5, a1 sh2add a1, a3, a1
vse32.v v1, (t1) vse32.v v1, (t1)
vse32.v v2, (t2) vse32.v v2, (t2)
vse32.v v3, (t3) vse32.v v3, (t3)
@ -329,7 +331,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
addi a1, a1, -8 * 4 addi a1, a1, -8 * 4
li a4, 8 li a4, 8
slli a6, a5, 3 + 2 slli a6, a3, 3 + 2
2: 2:
vsetvli zero, zero, e32, m1, ta, ma vsetvli zero, zero, e32, m1, ta, ma
vlseg8e32.v v0, (a1) vlseg8e32.v v0, (a1)
@ -348,7 +350,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
vle16.v v21, (t5) vle16.v v21, (t5)
add a7, t6, a2 add a7, t6, a2
vle16.v v22, (t6) vle16.v v22, (t6)
sub a4, a4, a5 sub a4, a4, a3
vle16.v v23, (a7) vle16.v v23, (a7)
.irp n,0,1,2,3,4,5,6,7 .irp n,0,1,2,3,4,5,6,7
vssra.vi v\n, v\n, 6 vssra.vi v\n, v\n, 6
@ -368,7 +370,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
vmax.vx v\n, v\n, zero vmax.vx v\n, v\n, zero
.endr .endr
.irp n,0,1,2,3,4,5,6,7 .irp n,0,1,2,3,4,5,6,7
vmin.vx v\n, v\n, a3 vmin.vx v\n, v\n, a5
.endr .endr
vsetvli zero, zero, e16, mf2, ta, ma vsetvli zero, zero, e16, mf2, ta, ma
vncvt.x.x.w v16, v0 vncvt.x.x.w v16, v0
@ -380,7 +382,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
vncvt.x.x.w v22, v6 vncvt.x.x.w v22, v6
vncvt.x.x.w v23, v7 vncvt.x.x.w v23, v7
vse16.v v16, (a0) vse16.v v16, (a0)
sh1add a0, a5, a0 sh1add a0, a3, a0
vse16.v v17, (t1) vse16.v v17, (t1)
vse16.v v18, (t2) vse16.v v18, (t2)
vse16.v v19, (t3) vse16.v v19, (t3)
@ -400,12 +402,12 @@ endfunc
.irp depth, 9, 10, 12, 14 .irp depth, 9, 10, 12, 14
func ff_h264_idct_add_\depth\()_rvv, zve32x func ff_h264_idct_add_\depth\()_rvv, zve32x
li a3, (1 << \depth) - 1 li a5, (1 << \depth) - 1
j ff_h264_idct_add_16_rvv j ff_h264_idct_add_16_rvv
endfunc endfunc
func ff_h264_idct8_add_\depth\()_rvv, zve32x func ff_h264_idct8_add_\depth\()_rvv, zve32x
li a3, (1 << \depth) - 1 li a5, (1 << \depth) - 1
j ff_h264_idct8_add_16_rvv j ff_h264_idct8_add_16_rvv
endfunc endfunc
.endr .endr
@ -416,13 +418,13 @@ const ff_h264_scan8
endconst endconst
#if (__riscv_xlen == 64) #if (__riscv_xlen == 64)
.irp depth, 8 .irp depth, 8, 16
func ff_h264_idct_add16_\depth\()_rvv, zve32x func ff_h264_idct_add16_\depth\()_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
addi sp, sp, -80 addi sp, sp, -96
lla t0, ff_h264_scan8 lla t0, ff_h264_scan8
sd s0, (sp) sd s0, (sp)
li t1, 32 << (\depth > 8) li t1, 32 * (\depth / 8)
mv s0, sp mv s0, sp
sd ra, 8(sp) sd ra, 8(sp)
sd s1, 16(sp) sd s1, 16(sp)
@ -432,9 +434,19 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
sd s5, 48(sp) sd s5, 48(sp)
sd s6, 56(sp) sd s6, 56(sp)
sd s7, 64(sp) sd s7, 64(sp)
.if \depth > 8
sd s8, 72(sp)
sd s9, 80(sp)
mv s8, a5
mv s9, a6
.endif
vsetivli zero, 16, e8, m1, ta, ma vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0) vle8.v v8, (t0)
.if \depth == 8
vlse16.v v16, (a2), t1 vlse16.v v16, (a2), t1
.else
vlse32.v v16, (a2), t1
.endif
vluxei8.v v12, (a4), v8 vluxei8.v v12, (a4), v8
.if \depth == 8 .if \depth == 8
vsetvli zero, zero, e16, m2, ta, ma vsetvli zero, zero, e16, m2, ta, ma
@ -464,17 +476,28 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
mv a1, s6 mv a1, s6
mv a2, s7 mv a2, s7
add a0, s4, t2 add a0, s4, t2
beqz t1, 2f # if (nnz == 1 && block[i * 16]) .if \depth > 8
call ff_h264_idct_dc_add_\depth\()_c mv a5, s8
.endif
bnez t1, 2f # if (nnz == 1 && block[i * 16])
jal .Lidct_add4_\depth\()_rvv
j 3f j 3f
2: 2:
call .Lidct_add4_\depth\()_rvv .if \depth == 8
call ff_h264_idct_dc_add_\depth\()_c
.else
jalr s9
.endif
3: 3:
srli s3, s3, 1 srli s3, s3, 1
addi s5, s5, 4 addi s5, s5, 4
addi s6, s6, 16 * 2 << (\depth > 8) addi s6, s6, 16 * 2 * (\depth / 8)
bnez s1, 1b bnez s1, 1b
.if \depth > 8
ld s9, 80(sp)
ld s8, 72(sp)
.endif
ld s7, 64(sp) ld s7, 64(sp)
ld s6, 56(sp) ld s6, 56(sp)
ld s5, 48(sp) ld s5, 48(sp)
@ -484,16 +507,16 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
ld s1, 16(sp) ld s1, 16(sp)
ld ra, 8(sp) ld ra, 8(sp)
ld s0, 0(sp) ld s0, 0(sp)
addi sp, sp, 80 addi sp, sp, 96
ret ret
endfunc endfunc
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
addi sp, sp, -80 addi sp, sp, -96
lla t0, ff_h264_scan8 lla t0, ff_h264_scan8
sd s0, (sp) sd s0, (sp)
li t1, 32 << (\depth > 8) li t1, 32 * (\depth / 8)
mv s0, sp mv s0, sp
sd ra, 8(sp) sd ra, 8(sp)
sd s1, 16(sp) sd s1, 16(sp)
@ -503,9 +526,19 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
sd s5, 48(sp) sd s5, 48(sp)
sd s6, 56(sp) sd s6, 56(sp)
sd s7, 64(sp) sd s7, 64(sp)
.if \depth > 8
sd s8, 72(sp)
sd s9, 80(sp)
mv s8, a5
mv s9, a6
.endif
vsetivli zero, 16, e8, m1, ta, ma vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0) vle8.v v8, (t0)
.if \depth == 8
vlse16.v v16, (a2), t1 vlse16.v v16, (a2), t1
.else
vlse32.v v16, (a2), t1
.endif
vluxei8.v v12, (a4), v8 vluxei8.v v12, (a4), v8
.if \depth == 8 .if \depth == 8
vsetvli zero, zero, e16, m2, ta, ma vsetvli zero, zero, e16, m2, ta, ma
@ -532,18 +565,29 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
mv a1, s6 mv a1, s6
mv a2, s7 mv a2, s7
add a0, s4, t2 add a0, s4, t2
.if \depth > 8
mv a5, s8
.endif
beqz t0, 2f # if (nnzc[scan8[i]]) beqz t0, 2f # if (nnzc[scan8[i]])
call .Lidct_add4_\depth\()_rvv jal .Lidct_add4_\depth\()_rvv
j 3f j 3f
2: 2:
beqz t1, 3f # if (block[i * 16]) beqz t1, 3f # if (block[i * 16])
.if \depth == 8
call ff_h264_idct_dc_add_\depth\()_c call ff_h264_idct_dc_add_\depth\()_c
.else
jalr s9
.endif
3: 3:
srli s3, s3, 1 srli s3, s3, 1
addi s5, s5, 4 addi s5, s5, 4
addi s6, s6, 16 * 2 << (\depth > 8) addi s6, s6, 16 * 2 * (\depth / 8)
bnez s1, 1b bnez s1, 1b
.if \depth > 8
ld s9, 80(sp)
ld s8, 72(sp)
.endif
ld s7, 64(sp) ld s7, 64(sp)
ld s6, 56(sp) ld s6, 56(sp)
ld s5, 48(sp) ld s5, 48(sp)
@ -553,16 +597,16 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
ld s1, 16(sp) ld s1, 16(sp)
ld ra, 8(sp) ld ra, 8(sp)
ld s0, 0(sp) ld s0, 0(sp)
addi sp, sp, 80 addi sp, sp, 96
ret ret
endfunc endfunc
func ff_h264_idct8_add4_\depth\()_rvv, zve32x func ff_h264_idct8_add4_\depth\()_rvv, zve32x
csrwi vxrm, 0 csrwi vxrm, 0
addi sp, sp, -80 addi sp, sp, -96
lla t0, ff_h264_scan8 lla t0, ff_h264_scan8
sd s0, (sp) sd s0, (sp)
li t1, 4 * 32 << (\depth > 8) li t1, 4 * 32 * (\depth / 8)
mv s0, sp mv s0, sp
li t2, 4 li t2, 4
sd ra, 8(sp) sd ra, 8(sp)
@ -573,9 +617,19 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
sd s5, 48(sp) sd s5, 48(sp)
sd s6, 56(sp) sd s6, 56(sp)
sd s7, 64(sp) sd s7, 64(sp)
.if \depth > 8
sd s8, 72(sp)
sd s9, 80(sp)
mv s8, a5
mv s9, a6
.endif
vsetivli zero, 4, e8, mf4, ta, ma vsetivli zero, 4, e8, mf4, ta, ma
vlse8.v v8, (t0), t2 vlse8.v v8, (t0), t2
.if \depth == 8
vlse16.v v16, (a2), t1 vlse16.v v16, (a2), t1
.else
vlse32.v v16, (a2), t1
.endif
vluxei8.v v12, (a4), v8 vluxei8.v v12, (a4), v8
.if \depth == 8 .if \depth == 8
vsetvli zero, zero, e16, mf2, ta, ma vsetvli zero, zero, e16, mf2, ta, ma
@ -604,17 +658,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
mv a1, s6 mv a1, s6
mv a2, s7 mv a2, s7
add a0, s4, t2 add a0, s4, t2
beqz t1, 2f # if (nnz == 1 && block[i * 16]) .if \depth > 8
call ff_h264_idct8_dc_add_\depth\()_c mv a5, s8
.endif
bnez t1, 2f # if (nnz == 1 && block[i * 16])
jal .Lidct8_add_\depth\()_rvv
j 3f j 3f
2: 2:
call .Lidct8_add_\depth\()_rvv .if \depth == 8
call ff_h264_idct8_dc_add_\depth\()_c
.else
jalr s9
.endif
3: 3:
srli s3, s3, 1 srli s3, s3, 1
addi s5, s5, 4 * 4 addi s5, s5, 4 * 4
addi s6, s6, 4 * 16 * 2 << (\depth > 8) addi s6, s6, 4 * 16 * 2 * (\depth / 8)
bnez s1, 1b bnez s1, 1b
.if \depth > 8
ld s9, 80(sp)
ld s8, 72(sp)
.endif
ld s7, 64(sp) ld s7, 64(sp)
ld s6, 56(sp) ld s6, 56(sp)
ld s5, 48(sp) ld s5, 48(sp)
@ -624,8 +689,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
ld s1, 16(sp) ld s1, 16(sp)
ld ra, 8(sp) ld ra, 8(sp)
ld s0, 0(sp) ld s0, 0(sp)
addi sp, sp, 80 addi sp, sp, 96
ret ret
endfunc endfunc
.endr .endr
/*
 * Per-bit-depth entry points (9, 10, 12 and 14 bits) for the block-loop
 * IDCT functions. Each one only loads two extra register arguments and then
 * jumps to the shared implementation instantiated with depth 16:
 *   a5 = (1 << depth) - 1, the upper bound applied by vmin.vx when the
 *        reconstructed pixels are clamped;
 *   a6 = address of the DC-only routine for this depth.
 * The shared bodies copy a5 and a6 into s8 and s9 on entry, reload a5 from
 * s8 before processing each block, and reach the DC routine with jalr s9.
 * The incoming arguments in a0 to a4 are passed through untouched.
 */
.irp depth, 9, 10, 12, 14
/* 4x4 blocks: same arguments as the C prototype of idct_add16. */
func ff_h264_idct_add16_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
lla a6, ff_h264_idct_dc_add_\depth\()_c
j ff_h264_idct_add16_16_rvv
endfunc
/* 4x4 intra blocks: same register setup, intra variant of the shared loop. */
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
lla a6, ff_h264_idct_dc_add_\depth\()_c
j ff_h264_idct_add16intra_16_rvv
endfunc
/* 8x8 blocks: a6 points at the 8x8 DC-only routine instead of the 4x4 one. */
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
lla a6, ff_h264_idct8_dc_add_\depth\()_c
j ff_h264_idct8_add4_16_rvv
endfunc
.endr
#endif #endif