lavc/h264dsp: use RISC-V B extension

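Both 16-bit masks are now packed into a single register, and the upper one is
tested with `bexti`. This saves one register and one instruction per transform.
add16 and add16intra thus become stack-less: the freed register holds the
return address, so it no longer needs to be spilled.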
This commit is contained in:
Rémi Denis-Courmont 2024-07-19 22:16:01 +03:00
parent 45d7078a21
commit b62586e310
2 changed files with 38 additions and 38 deletions

View File

@ -98,13 +98,14 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct_add = ff_h264_idct_add_8_rvv;
dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv;
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
if (flags & AV_CPU_FLAG_RVB) {
dsp->h264_idct_add16 = ff_h264_idct_add16_8_rvv;
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
# if __riscv_xlen == 64
dsp->h264_idct_add16 = ff_h264_idct_add16_8_rvv;
dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
dsp->h264_idct8_add4 = ff_h264_idct8_add4_8_rvv;
# endif
if (flags & AV_CPU_FLAG_RVV_I32)
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
}
if (flags & AV_CPU_FLAG_RVV_I64) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_rvv;
@ -118,16 +119,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
if (flags & AV_CPU_FLAG_RVB_ADDR) \
dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
if (zvl128b && (flags & AV_CPU_FLAG_RVB_ADDR)) { \
if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \
dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_##depth##_rvv; \
dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \
if (__riscv_xlen == 64) { \
dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
dsp->h264_idct_add16intra = \
ff_h264_idct_add16intra_##depth##_rvv; \
} \
} \
if (__riscv_xlen == 64 && zvl128b) { \
dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
dsp->h264_idct_add16intra = \
ff_h264_idct_add16intra_##depth##_rvv; \
} \
if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \
if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB)) \
dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \
}

View File

@ -532,16 +532,11 @@ const ff_h264_scan8
.byte 034, 035, 044, 045, 036, 037, 046, 047
endconst
#if (__riscv_xlen == 64)
.macro idct4_adds type, depth
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
csrwi vxrm, 0
addi sp, sp, -16
lla t0, ff_h264_scan8
sd s0, (sp)
li t1, 32 * (\depth / 8)
mv s0, sp
sd ra, 8(sp)
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0)
.if \depth == 8
@ -567,20 +562,23 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
vsetvli zero, zero, e16, m2, ta, ma
vmv.x.s a4, v0
vmv.x.s a7, v1
zext.h a4, a4
slli a7, a7, 16
mv t4, a0
or a4, a4, a7
mv t5, a1
mv a1, a2
mv a2, a3
li a3, 16
mv a7, ra
1:
andi t0, a4, 1
addi a3, a3, -1
srli a4, a4, 1
.ifc \type, 16
beqz t0, 3f # if (nnz)
.endif
lw t2, (t5) # block_offset[i]
andi t1, a7, 1
bexti t1, a4, 16
add a0, t4, t2
.ifc \type, 16
bnez t1, 2f # if (nnz == 1 && block[i * 16])
@ -595,14 +593,12 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
.endif
jal ff_h264_idct4_dc_add_\depth\()_rvv
3:
srli a7, a7, 1
srli a4, a4, 1
addi t5, t5, 4
addi a1, a1, 16 * 2 * (\depth / 8)
bnez a3, 1b
ld ra, 8(sp)
ld s0, 0(sp)
addi sp, sp, 16
mv ra, a7
ret
endfunc
.endm
@ -611,9 +607,10 @@ endfunc
idct4_adds 16, \depth
idct4_adds 16intra, \depth
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
#if (__riscv_xlen == 64)
func ff_h264_idct8_add4_\depth\()_rvv, zve32x, b
csrwi vxrm, 0
addi sp, sp, -64
addi sp, sp, -48
lla t0, ff_h264_scan8
sd s0, (sp)
li t1, 4 * 32 * (\depth / 8)
@ -622,9 +619,8 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
sd ra, 8(sp)
sd s1, 16(sp)
sd s2, 24(sp)
sd s3, 32(sp)
sd s4, 40(sp)
sd s5, 48(sp)
sd s4, 32(sp)
sd s5, 40(sp)
vsetivli zero, 4, e8, mf4, ta, ma
vlse8.v v8, (t0), t2
.if \depth == 8
@ -644,8 +640,11 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
vmsne.vi v0, v12, 0
vmand.mm v1, v1, v2
vmv.x.s s2, v0
vmv.x.s s3, v1
vmv.x.s a7, v1
zext.h s2, s2
slli a7, a7, 16
li s1, 4
or s2, s2, a7
mv s4, a0
mv s5, a1
mv a1, a2
@ -653,10 +652,9 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
1:
andi t0, s2, 1
addi s1, s1, -1
srli s2, s2, 1
beqz t0, 3f # if (nnz)
lw t2, (s5) # block_offset[i]
andi t1, s3, 1
bexti t1, s2, 16
add a0, s4, t2
bnez t1, 2f # if (nnz == 1 && block[i * 16])
jal .Lidct8_add_\depth\()_rvv
@ -670,20 +668,20 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
3:
addi a1, a1, 4 * 16 * 2 * (\depth / 8)
4:
srli s3, s3, 1
srli s2, s2, 1
addi s5, s5, 4 * 4
bnez s1, 1b
ld s5, 48(sp)
ld s4, 40(sp)
ld s3, 32(sp)
ld s5, 40(sp)
ld s4, 32(sp)
ld s2, 24(sp)
ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
addi sp, sp, 64
addi sp, sp, 48
ret
endfunc
#endif
.endr
.irp depth, 9, 10, 12, 14
@ -697,9 +695,10 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
j ff_h264_idct_add16intra_16_rvv
endfunc
#if (__riscv_xlen == 64)
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
li a5, (1 << \depth) - 1
j ff_h264_idct8_add4_16_rvv
endfunc
.endr
#endif
.endr