mirror of https://git.ffmpeg.org/ffmpeg.git
lavc/h264dsp: R-V V high-depth idct_add{,intra}16, idct8_add4
As with the 8-bit versions, this tends to be faster, but the results vary widely because the distribution of non-zero coefficients is variable.
This commit is contained in:
parent
90672974bd
commit
483fd732ab
|
@ -41,26 +41,27 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
|
||||||
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
|
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
|
||||||
int alpha, int beta, int8_t *tc0);
|
int alpha, int beta, int8_t *tc0);
|
||||||
|
|
||||||
void ff_h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
|
#define IDCT_DEPTH(depth) \
|
||||||
void ff_h264_idct8_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
|
void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
|
||||||
void ff_h264_idct_add16_8_rvv(uint8_t *dst, const int *blockoffset,
|
void ff_h264_idct8_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
|
||||||
int16_t *block, int stride,
|
void ff_h264_idct4_dc_add_##depth##_rvv(uint8_t *, int16_t *, int); \
|
||||||
const uint8_t nnzc[5 * 8]);
|
void ff_h264_idct8_dc_add_##depth##_rvv(uint8_t *, int16_t *, int); \
|
||||||
void ff_h264_idct_add16intra_8_rvv(uint8_t *dst, const int *blockoffset,
|
void ff_h264_idct_add16_##depth##_rvv(uint8_t *d, const int *soffset, \
|
||||||
int16_t *block, int stride,
|
int16_t *s, int stride, \
|
||||||
const uint8_t nnzc[5 * 8]);
|
const uint8_t nnzc[5 * 8]); \
|
||||||
void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset,
|
void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \
|
||||||
int16_t *block, int stride,
|
int16_t *s, int stride, \
|
||||||
const uint8_t nnzc[5 * 8]);
|
const uint8_t nnzc[5 * 8]); \
|
||||||
|
void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \
|
||||||
|
int16_t *s, int stride, \
|
||||||
|
const uint8_t nnzc[5 * 8]);
|
||||||
|
|
||||||
void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
|
IDCT_DEPTH(8)
|
||||||
void ff_h264_idct8_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
|
IDCT_DEPTH(9)
|
||||||
void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
|
IDCT_DEPTH(10)
|
||||||
void ff_h264_idct8_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
|
IDCT_DEPTH(12)
|
||||||
void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
|
IDCT_DEPTH(14)
|
||||||
void ff_h264_idct8_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
|
#undef IDCT_DEPTH
|
||||||
void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
|
|
||||||
void ff_h264_idct8_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
|
|
||||||
|
|
||||||
void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride);
|
void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride);
|
||||||
void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride);
|
void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
@ -69,16 +70,6 @@ void ff_h264_add_pixels4_16_rvv(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
|
||||||
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
|
extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
|
||||||
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
|
extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
|
||||||
void ff_h264_idct4_dc_add_8_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct8_dc_add_8_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct4_dc_add_9_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct8_dc_add_9_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct4_dc_add_10_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct8_dc_add_10_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct4_dc_add_12_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct8_dc_add_12_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct4_dc_add_14_rvv(uint8_t *, int16_t *, int);
|
|
||||||
void ff_h264_idct8_dc_add_14_rvv(uint8_t *, int16_t *, int);
|
|
||||||
|
|
||||||
av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
|
av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
|
||||||
const int chroma_format_idc)
|
const int chroma_format_idc)
|
||||||
|
@ -120,38 +111,31 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
|
||||||
}
|
}
|
||||||
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
|
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
|
||||||
}
|
}
|
||||||
if (bit_depth == 9) {
|
|
||||||
if (zvl128b) {
|
#define IDCT_DEPTH(depth) \
|
||||||
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_9_rvv;
|
if (bit_depth == depth) { \
|
||||||
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_9_rvv;
|
if (zvl128b) \
|
||||||
dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
|
dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
|
||||||
}
|
if (flags & AV_CPU_FLAG_RVB_ADDR) \
|
||||||
dsp->h264_idct8_add = ff_h264_idct8_add_9_rvv;
|
dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
|
||||||
}
|
if (zvl128b && (flags & AV_CPU_FLAG_RVB_ADDR)) { \
|
||||||
if (bit_depth == 10) {
|
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \
|
||||||
if (zvl128b) {
|
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \
|
||||||
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_10_rvv;
|
} \
|
||||||
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_rvv;
|
if (__riscv_xlen == 64 && zvl128b) { \
|
||||||
dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
|
dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
|
||||||
}
|
dsp->h264_idct_add16intra = \
|
||||||
dsp->h264_idct8_add = ff_h264_idct8_add_10_rvv;
|
ff_h264_idct_add16intra_##depth##_rvv; \
|
||||||
}
|
} \
|
||||||
if (bit_depth == 12) {
|
if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \
|
||||||
if (zvl128b) {
|
dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \
|
||||||
dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
|
|
||||||
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_12_rvv;
|
|
||||||
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_12_rvv;
|
|
||||||
}
|
|
||||||
dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv;
|
|
||||||
}
|
|
||||||
if (bit_depth == 14) {
|
|
||||||
if (zvl128b) {
|
|
||||||
dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
|
|
||||||
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_14_rvv;
|
|
||||||
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_14_rvv;
|
|
||||||
}
|
|
||||||
dsp->h264_idct8_add = ff_h264_idct8_add_14_rvv;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
IDCT_DEPTH(9)
|
||||||
|
IDCT_DEPTH(10)
|
||||||
|
IDCT_DEPTH(12)
|
||||||
|
IDCT_DEPTH(14)
|
||||||
|
|
||||||
if (bit_depth > 8 && zvl128b) {
|
if (bit_depth > 8 && zvl128b) {
|
||||||
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv;
|
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv;
|
||||||
if (flags & AV_CPU_FLAG_RVV_I64)
|
if (flags & AV_CPU_FLAG_RVV_I64)
|
||||||
|
|
|
@ -107,6 +107,7 @@ endfunc
|
||||||
|
|
||||||
func ff_h264_idct_add_16_rvv, zve32x
|
func ff_h264_idct_add_16_rvv, zve32x
|
||||||
csrwi vxrm, 0
|
csrwi vxrm, 0
|
||||||
|
.Lidct_add4_16_rvv:
|
||||||
vsetivli zero, 4, e32, m1, ta, ma
|
vsetivli zero, 4, e32, m1, ta, ma
|
||||||
addi t1, a1, 1 * 4 * 4
|
addi t1, a1, 1 * 4 * 4
|
||||||
vle32.v v0, (a1)
|
vle32.v v0, (a1)
|
||||||
|
@ -147,7 +148,7 @@ func ff_h264_idct_add_16_rvv, zve32x
|
||||||
vmax.vx v\n, v\n, zero
|
vmax.vx v\n, v\n, zero
|
||||||
.endr
|
.endr
|
||||||
.irp n,0,1,2,3
|
.irp n,0,1,2,3
|
||||||
vmin.vx v\n, v\n, a3
|
vmin.vx v\n, v\n, a5
|
||||||
.endr
|
.endr
|
||||||
vsetvli zero, zero, e16, mf2, ta, ma
|
vsetvli zero, zero, e16, mf2, ta, ma
|
||||||
vncvt.x.x.w v4, v0
|
vncvt.x.x.w v4, v0
|
||||||
|
@ -295,9 +296,10 @@ func ff_h264_idct8_add_8_rvv, zve32x
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
func ff_h264_idct8_add_16_rvv, zve32x
|
func ff_h264_idct8_add_16_rvv, zve32x
|
||||||
li a4, 8
|
|
||||||
csrwi vxrm, 0
|
csrwi vxrm, 0
|
||||||
vsetivli a5, 8, e32, m1, ta, ma
|
.Lidct8_add_16_rvv:
|
||||||
|
li a4, 8
|
||||||
|
vsetivli a3, 8, e32, m1, ta, ma
|
||||||
1:
|
1:
|
||||||
addi t1, a1, 1 * 8 * 4
|
addi t1, a1, 1 * 8 * 4
|
||||||
vle32.v v0, (a1)
|
vle32.v v0, (a1)
|
||||||
|
@ -313,11 +315,11 @@ func ff_h264_idct8_add_16_rvv, zve32x
|
||||||
vle32.v v5, (t5)
|
vle32.v v5, (t5)
|
||||||
addi a7, a1, 7 * 8 * 4
|
addi a7, a1, 7 * 8 * 4
|
||||||
vle32.v v6, (t6)
|
vle32.v v6, (t6)
|
||||||
sub a4, a4, a5
|
sub a4, a4, a3
|
||||||
vle32.v v7, (a7)
|
vle32.v v7, (a7)
|
||||||
jal t0, ff_h264_idct8_rvv
|
jal t0, ff_h264_idct8_rvv
|
||||||
vse32.v v0, (a1)
|
vse32.v v0, (a1)
|
||||||
sh2add a1, a5, a1
|
sh2add a1, a3, a1
|
||||||
vse32.v v1, (t1)
|
vse32.v v1, (t1)
|
||||||
vse32.v v2, (t2)
|
vse32.v v2, (t2)
|
||||||
vse32.v v3, (t3)
|
vse32.v v3, (t3)
|
||||||
|
@ -329,7 +331,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
|
||||||
|
|
||||||
addi a1, a1, -8 * 4
|
addi a1, a1, -8 * 4
|
||||||
li a4, 8
|
li a4, 8
|
||||||
slli a6, a5, 3 + 2
|
slli a6, a3, 3 + 2
|
||||||
2:
|
2:
|
||||||
vsetvli zero, zero, e32, m1, ta, ma
|
vsetvli zero, zero, e32, m1, ta, ma
|
||||||
vlseg8e32.v v0, (a1)
|
vlseg8e32.v v0, (a1)
|
||||||
|
@ -348,7 +350,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
|
||||||
vle16.v v21, (t5)
|
vle16.v v21, (t5)
|
||||||
add a7, t6, a2
|
add a7, t6, a2
|
||||||
vle16.v v22, (t6)
|
vle16.v v22, (t6)
|
||||||
sub a4, a4, a5
|
sub a4, a4, a3
|
||||||
vle16.v v23, (a7)
|
vle16.v v23, (a7)
|
||||||
.irp n,0,1,2,3,4,5,6,7
|
.irp n,0,1,2,3,4,5,6,7
|
||||||
vssra.vi v\n, v\n, 6
|
vssra.vi v\n, v\n, 6
|
||||||
|
@ -368,7 +370,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
|
||||||
vmax.vx v\n, v\n, zero
|
vmax.vx v\n, v\n, zero
|
||||||
.endr
|
.endr
|
||||||
.irp n,0,1,2,3,4,5,6,7
|
.irp n,0,1,2,3,4,5,6,7
|
||||||
vmin.vx v\n, v\n, a3
|
vmin.vx v\n, v\n, a5
|
||||||
.endr
|
.endr
|
||||||
vsetvli zero, zero, e16, mf2, ta, ma
|
vsetvli zero, zero, e16, mf2, ta, ma
|
||||||
vncvt.x.x.w v16, v0
|
vncvt.x.x.w v16, v0
|
||||||
|
@ -380,7 +382,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
|
||||||
vncvt.x.x.w v22, v6
|
vncvt.x.x.w v22, v6
|
||||||
vncvt.x.x.w v23, v7
|
vncvt.x.x.w v23, v7
|
||||||
vse16.v v16, (a0)
|
vse16.v v16, (a0)
|
||||||
sh1add a0, a5, a0
|
sh1add a0, a3, a0
|
||||||
vse16.v v17, (t1)
|
vse16.v v17, (t1)
|
||||||
vse16.v v18, (t2)
|
vse16.v v18, (t2)
|
||||||
vse16.v v19, (t3)
|
vse16.v v19, (t3)
|
||||||
|
@ -400,12 +402,12 @@ endfunc
|
||||||
|
|
||||||
.irp depth, 9, 10, 12, 14
|
.irp depth, 9, 10, 12, 14
|
||||||
func ff_h264_idct_add_\depth\()_rvv, zve32x
|
func ff_h264_idct_add_\depth\()_rvv, zve32x
|
||||||
li a3, (1 << \depth) - 1
|
li a5, (1 << \depth) - 1
|
||||||
j ff_h264_idct_add_16_rvv
|
j ff_h264_idct_add_16_rvv
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
func ff_h264_idct8_add_\depth\()_rvv, zve32x
|
func ff_h264_idct8_add_\depth\()_rvv, zve32x
|
||||||
li a3, (1 << \depth) - 1
|
li a5, (1 << \depth) - 1
|
||||||
j ff_h264_idct8_add_16_rvv
|
j ff_h264_idct8_add_16_rvv
|
||||||
endfunc
|
endfunc
|
||||||
.endr
|
.endr
|
||||||
|
@ -416,13 +418,13 @@ const ff_h264_scan8
|
||||||
endconst
|
endconst
|
||||||
|
|
||||||
#if (__riscv_xlen == 64)
|
#if (__riscv_xlen == 64)
|
||||||
.irp depth, 8
|
.irp depth, 8, 16
|
||||||
func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
||||||
csrwi vxrm, 0
|
csrwi vxrm, 0
|
||||||
addi sp, sp, -80
|
addi sp, sp, -96
|
||||||
lla t0, ff_h264_scan8
|
lla t0, ff_h264_scan8
|
||||||
sd s0, (sp)
|
sd s0, (sp)
|
||||||
li t1, 32 << (\depth > 8)
|
li t1, 32 * (\depth / 8)
|
||||||
mv s0, sp
|
mv s0, sp
|
||||||
sd ra, 8(sp)
|
sd ra, 8(sp)
|
||||||
sd s1, 16(sp)
|
sd s1, 16(sp)
|
||||||
|
@ -432,9 +434,19 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
||||||
sd s5, 48(sp)
|
sd s5, 48(sp)
|
||||||
sd s6, 56(sp)
|
sd s6, 56(sp)
|
||||||
sd s7, 64(sp)
|
sd s7, 64(sp)
|
||||||
|
.if \depth > 8
|
||||||
|
sd s8, 72(sp)
|
||||||
|
sd s9, 80(sp)
|
||||||
|
mv s8, a5
|
||||||
|
mv s9, a6
|
||||||
|
.endif
|
||||||
vsetivli zero, 16, e8, m1, ta, ma
|
vsetivli zero, 16, e8, m1, ta, ma
|
||||||
vle8.v v8, (t0)
|
vle8.v v8, (t0)
|
||||||
|
.if \depth == 8
|
||||||
vlse16.v v16, (a2), t1
|
vlse16.v v16, (a2), t1
|
||||||
|
.else
|
||||||
|
vlse32.v v16, (a2), t1
|
||||||
|
.endif
|
||||||
vluxei8.v v12, (a4), v8
|
vluxei8.v v12, (a4), v8
|
||||||
.if \depth == 8
|
.if \depth == 8
|
||||||
vsetvli zero, zero, e16, m2, ta, ma
|
vsetvli zero, zero, e16, m2, ta, ma
|
||||||
|
@ -464,17 +476,28 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
||||||
mv a1, s6
|
mv a1, s6
|
||||||
mv a2, s7
|
mv a2, s7
|
||||||
add a0, s4, t2
|
add a0, s4, t2
|
||||||
beqz t1, 2f # if (nnz == 1 && block[i * 16])
|
.if \depth > 8
|
||||||
call ff_h264_idct_dc_add_\depth\()_c
|
mv a5, s8
|
||||||
|
.endif
|
||||||
|
bnez t1, 2f # if (nnz == 1 && block[i * 16])
|
||||||
|
jal .Lidct_add4_\depth\()_rvv
|
||||||
j 3f
|
j 3f
|
||||||
2:
|
2:
|
||||||
call .Lidct_add4_\depth\()_rvv
|
.if \depth == 8
|
||||||
|
call ff_h264_idct_dc_add_\depth\()_c
|
||||||
|
.else
|
||||||
|
jalr s9
|
||||||
|
.endif
|
||||||
3:
|
3:
|
||||||
srli s3, s3, 1
|
srli s3, s3, 1
|
||||||
addi s5, s5, 4
|
addi s5, s5, 4
|
||||||
addi s6, s6, 16 * 2 << (\depth > 8)
|
addi s6, s6, 16 * 2 * (\depth / 8)
|
||||||
bnez s1, 1b
|
bnez s1, 1b
|
||||||
|
|
||||||
|
.if \depth > 8
|
||||||
|
ld s9, 80(sp)
|
||||||
|
ld s8, 72(sp)
|
||||||
|
.endif
|
||||||
ld s7, 64(sp)
|
ld s7, 64(sp)
|
||||||
ld s6, 56(sp)
|
ld s6, 56(sp)
|
||||||
ld s5, 48(sp)
|
ld s5, 48(sp)
|
||||||
|
@ -484,16 +507,16 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
||||||
ld s1, 16(sp)
|
ld s1, 16(sp)
|
||||||
ld ra, 8(sp)
|
ld ra, 8(sp)
|
||||||
ld s0, 0(sp)
|
ld s0, 0(sp)
|
||||||
addi sp, sp, 80
|
addi sp, sp, 96
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
||||||
csrwi vxrm, 0
|
csrwi vxrm, 0
|
||||||
addi sp, sp, -80
|
addi sp, sp, -96
|
||||||
lla t0, ff_h264_scan8
|
lla t0, ff_h264_scan8
|
||||||
sd s0, (sp)
|
sd s0, (sp)
|
||||||
li t1, 32 << (\depth > 8)
|
li t1, 32 * (\depth / 8)
|
||||||
mv s0, sp
|
mv s0, sp
|
||||||
sd ra, 8(sp)
|
sd ra, 8(sp)
|
||||||
sd s1, 16(sp)
|
sd s1, 16(sp)
|
||||||
|
@ -503,9 +526,19 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
||||||
sd s5, 48(sp)
|
sd s5, 48(sp)
|
||||||
sd s6, 56(sp)
|
sd s6, 56(sp)
|
||||||
sd s7, 64(sp)
|
sd s7, 64(sp)
|
||||||
|
.if \depth > 8
|
||||||
|
sd s8, 72(sp)
|
||||||
|
sd s9, 80(sp)
|
||||||
|
mv s8, a5
|
||||||
|
mv s9, a6
|
||||||
|
.endif
|
||||||
vsetivli zero, 16, e8, m1, ta, ma
|
vsetivli zero, 16, e8, m1, ta, ma
|
||||||
vle8.v v8, (t0)
|
vle8.v v8, (t0)
|
||||||
|
.if \depth == 8
|
||||||
vlse16.v v16, (a2), t1
|
vlse16.v v16, (a2), t1
|
||||||
|
.else
|
||||||
|
vlse32.v v16, (a2), t1
|
||||||
|
.endif
|
||||||
vluxei8.v v12, (a4), v8
|
vluxei8.v v12, (a4), v8
|
||||||
.if \depth == 8
|
.if \depth == 8
|
||||||
vsetvli zero, zero, e16, m2, ta, ma
|
vsetvli zero, zero, e16, m2, ta, ma
|
||||||
|
@ -532,18 +565,29 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
||||||
mv a1, s6
|
mv a1, s6
|
||||||
mv a2, s7
|
mv a2, s7
|
||||||
add a0, s4, t2
|
add a0, s4, t2
|
||||||
|
.if \depth > 8
|
||||||
|
mv a5, s8
|
||||||
|
.endif
|
||||||
beqz t0, 2f # if (nnzc[scan8[i]])
|
beqz t0, 2f # if (nnzc[scan8[i]])
|
||||||
call .Lidct_add4_\depth\()_rvv
|
jal .Lidct_add4_\depth\()_rvv
|
||||||
j 3f
|
j 3f
|
||||||
2:
|
2:
|
||||||
beqz t1, 3f # if (block[i * 16])
|
beqz t1, 3f # if (block[i * 16])
|
||||||
|
.if \depth == 8
|
||||||
call ff_h264_idct_dc_add_\depth\()_c
|
call ff_h264_idct_dc_add_\depth\()_c
|
||||||
|
.else
|
||||||
|
jalr s9
|
||||||
|
.endif
|
||||||
3:
|
3:
|
||||||
srli s3, s3, 1
|
srli s3, s3, 1
|
||||||
addi s5, s5, 4
|
addi s5, s5, 4
|
||||||
addi s6, s6, 16 * 2 << (\depth > 8)
|
addi s6, s6, 16 * 2 * (\depth / 8)
|
||||||
bnez s1, 1b
|
bnez s1, 1b
|
||||||
|
|
||||||
|
.if \depth > 8
|
||||||
|
ld s9, 80(sp)
|
||||||
|
ld s8, 72(sp)
|
||||||
|
.endif
|
||||||
ld s7, 64(sp)
|
ld s7, 64(sp)
|
||||||
ld s6, 56(sp)
|
ld s6, 56(sp)
|
||||||
ld s5, 48(sp)
|
ld s5, 48(sp)
|
||||||
|
@ -553,16 +597,16 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
||||||
ld s1, 16(sp)
|
ld s1, 16(sp)
|
||||||
ld ra, 8(sp)
|
ld ra, 8(sp)
|
||||||
ld s0, 0(sp)
|
ld s0, 0(sp)
|
||||||
addi sp, sp, 80
|
addi sp, sp, 96
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
|
|
||||||
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||||
csrwi vxrm, 0
|
csrwi vxrm, 0
|
||||||
addi sp, sp, -80
|
addi sp, sp, -96
|
||||||
lla t0, ff_h264_scan8
|
lla t0, ff_h264_scan8
|
||||||
sd s0, (sp)
|
sd s0, (sp)
|
||||||
li t1, 4 * 32 << (\depth > 8)
|
li t1, 4 * 32 * (\depth / 8)
|
||||||
mv s0, sp
|
mv s0, sp
|
||||||
li t2, 4
|
li t2, 4
|
||||||
sd ra, 8(sp)
|
sd ra, 8(sp)
|
||||||
|
@ -573,9 +617,19 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||||
sd s5, 48(sp)
|
sd s5, 48(sp)
|
||||||
sd s6, 56(sp)
|
sd s6, 56(sp)
|
||||||
sd s7, 64(sp)
|
sd s7, 64(sp)
|
||||||
|
.if \depth > 8
|
||||||
|
sd s8, 72(sp)
|
||||||
|
sd s9, 80(sp)
|
||||||
|
mv s8, a5
|
||||||
|
mv s9, a6
|
||||||
|
.endif
|
||||||
vsetivli zero, 4, e8, mf4, ta, ma
|
vsetivli zero, 4, e8, mf4, ta, ma
|
||||||
vlse8.v v8, (t0), t2
|
vlse8.v v8, (t0), t2
|
||||||
|
.if \depth == 8
|
||||||
vlse16.v v16, (a2), t1
|
vlse16.v v16, (a2), t1
|
||||||
|
.else
|
||||||
|
vlse32.v v16, (a2), t1
|
||||||
|
.endif
|
||||||
vluxei8.v v12, (a4), v8
|
vluxei8.v v12, (a4), v8
|
||||||
.if \depth == 8
|
.if \depth == 8
|
||||||
vsetvli zero, zero, e16, mf2, ta, ma
|
vsetvli zero, zero, e16, mf2, ta, ma
|
||||||
|
@ -604,17 +658,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||||
mv a1, s6
|
mv a1, s6
|
||||||
mv a2, s7
|
mv a2, s7
|
||||||
add a0, s4, t2
|
add a0, s4, t2
|
||||||
beqz t1, 2f # if (nnz == 1 && block[i * 16])
|
.if \depth > 8
|
||||||
call ff_h264_idct8_dc_add_\depth\()_c
|
mv a5, s8
|
||||||
|
.endif
|
||||||
|
bnez t1, 2f # if (nnz == 1 && block[i * 16])
|
||||||
|
jal .Lidct8_add_\depth\()_rvv
|
||||||
j 3f
|
j 3f
|
||||||
2:
|
2:
|
||||||
call .Lidct8_add_\depth\()_rvv
|
.if \depth == 8
|
||||||
|
call ff_h264_idct8_dc_add_\depth\()_c
|
||||||
|
.else
|
||||||
|
jalr s9
|
||||||
|
.endif
|
||||||
3:
|
3:
|
||||||
srli s3, s3, 1
|
srli s3, s3, 1
|
||||||
addi s5, s5, 4 * 4
|
addi s5, s5, 4 * 4
|
||||||
addi s6, s6, 4 * 16 * 2 << (\depth > 8)
|
addi s6, s6, 4 * 16 * 2 * (\depth / 8)
|
||||||
bnez s1, 1b
|
bnez s1, 1b
|
||||||
|
|
||||||
|
.if \depth > 8
|
||||||
|
ld s9, 80(sp)
|
||||||
|
ld s8, 72(sp)
|
||||||
|
.endif
|
||||||
ld s7, 64(sp)
|
ld s7, 64(sp)
|
||||||
ld s6, 56(sp)
|
ld s6, 56(sp)
|
||||||
ld s5, 48(sp)
|
ld s5, 48(sp)
|
||||||
|
@ -624,8 +689,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||||
ld s1, 16(sp)
|
ld s1, 16(sp)
|
||||||
ld ra, 8(sp)
|
ld ra, 8(sp)
|
||||||
ld s0, 0(sp)
|
ld s0, 0(sp)
|
||||||
addi sp, sp, 80
|
addi sp, sp, 96
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
.endr
|
.endr
|
||||||
|
|
||||||
|
.irp depth, 9, 10, 12, 14
|
||||||
|
func ff_h264_idct_add16_\depth\()_rvv, zve32x
|
||||||
|
li a5, (1 << \depth) - 1
|
||||||
|
lla a6, ff_h264_idct_dc_add_\depth\()_c
|
||||||
|
j ff_h264_idct_add16_16_rvv
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
|
||||||
|
li a5, (1 << \depth) - 1
|
||||||
|
lla a6, ff_h264_idct_dc_add_\depth\()_c
|
||||||
|
j ff_h264_idct_add16intra_16_rvv
|
||||||
|
endfunc
|
||||||
|
|
||||||
|
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
|
||||||
|
li a5, (1 << \depth) - 1
|
||||||
|
lla a6, ff_h264_idct8_dc_add_\depth\()_c
|
||||||
|
j ff_h264_idct8_add4_16_rvv
|
||||||
|
endfunc
|
||||||
|
.endr
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue