lavc/vc1dsp: R-V V vc1_inv_trans_4x4

T-Head C908 (cycles):
vc1dsp.vc1_inv_trans_4x4_c: 310.7
vc1dsp.vc1_inv_trans_4x4_rvv_i32: 120.0

We could use 1 `vlseg4e64.v` instead of 4 `vle16.v`, but that seems to
be about 7% slower.
This commit is contained in:
Rémi Denis-Courmont 2024-06-04 21:33:53 +03:00
parent 6ffa639c8a
commit 3152c684cb
2 changed files with 47 additions and 0 deletions

View File

@ -32,6 +32,7 @@ void ff_vc1_inv_trans_4x8_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_8x4_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_vc1_inv_trans_4x4_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
void ff_put_pixels8x8_rvi(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t line_size, int rnd);
@ -59,6 +60,7 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_rvv;
dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_rvv;
dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_rvv;
dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_rvv;
dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;

View File

@ -373,6 +373,51 @@ func ff_vc1_inv_trans_4x8_rvv, zve32x
ret
endfunc
func ff_vc1_inv_trans_4x4_rvv, zve32x
# void ff_vc1_inv_trans_4x4_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block)
# In:  a0 = dest, a1 = dest stride (bytes), a2 = block
#      block rows are 8 coefficients (16 bytes) apart; only a 4x4 corner is used
# Two-pass VC-1 4x4 inverse transform, result added to dest with [0,255] clamp.
# Clobbers: a3, t0-t3, v0-v3, v8-v11, vtype/vl, vxrm.
li a3, 8 * 2
# Fixed-point rounding mode 0 (round-to-nearest-up) for the vnclipu narrows below.
csrwi vxrm, 0
vsetivli zero, 4, e16, mf2, ta, ma
# Strided segment load (stride = one block row): field f of each segment lands in
# v(0+f), so v0..v3 each gather one coefficient position across the four rows.
vlsseg4e16.v v0, (a2), a3
li t1, 3
# Pass 1: shared 4-point transform helper, butterflies across v0..v3, >> 3.
jal t0, ff_vc1_inv_trans_4_rvv
# Unit-stride segment store interleaves v0..v3, writing the pass-1 result back
# transposed and compacted (4x4 with 4-element rows) so rows are now contiguous.
vsseg4e16.v v0, (a2)
addi t1, a2, 1 * 4 * 2
# Reload the compacted rows with four unit-stride loads (commit msg: faster than
# a single vlseg4e64.v here).
vle16.v v0, (a2)
addi t2, a2, 2 * 4 * 2
vle16.v v1, (t1)
addi t3, a2, 3 * 4 * 2
vle16.v v2, (t2)
vle16.v v3, (t3)
li t1, 7
# Pass 2: same helper in the other direction, >> 7.
jal t0, ff_vc1_inv_trans_4_rvv
# t1..t3 = dest + 1/2/3 rows; load the four 4-pixel destination rows.
add t1, a1, a0
vle8.v v8, (a0)
add t2, a1, t1
vle8.v v9, (t1)
add t3, a1, t2
vle8.v v10, (t2)
vle8.v v11, (t3)
vsetvli zero, zero, e8, mf4, ta, ma
# Widening add: 16-bit residual += zero-extended 8-bit pixels.
vwaddu.wv v0, v0, v8
vwaddu.wv v1, v1, v9
vwaddu.wv v2, v2, v10
vwaddu.wv v3, v3, v11
vsetvli zero, zero, e16, mf2, ta, ma
.irp n,0,1,2,3
# Clamp negative sums to 0 (lower half of the pixel clip).
vmax.vx v\n, v\n, zero
.endr
vsetvli zero, zero, e8, mf4, ta, ma
# Narrow 16 -> 8 bits with unsigned saturation (caps at 255, upper half of clip),
# interleaved with the stores to hide latency.
vnclipu.wi v8, v0, 0
vnclipu.wi v9, v1, 0
vse8.v v8, (a0)
vnclipu.wi v10, v2, 0
vse8.v v9, (t1)
vnclipu.wi v11, v3, 0
vse8.v v10, (t2)
vse8.v v11, (t3)
ret
endfunc
.macro mspel_op op pos n1 n2
add t1, \pos, a2
v\op\()e8.v v\n1, (\pos)