lavc/vp8dsp: R-V V vp8_idct_add

T-Head C908 (cycles):
vp8_idct_add_c:       312.2
vp8_idct_add_rvv_i32: 117.0
This commit is contained in:
Rémi Denis-Courmont 2024-06-05 21:55:22 +03:00
parent e0f4d185f1
commit 658439934b
2 changed files with 61 additions and 0 deletions

View File

@ -27,6 +27,7 @@
#include "vp8dsp.h"
/*
 * Prototypes of the "_rvv" VP8 DSP routines. ff_vp8dsp_init_riscv() stores
 * these in the matching VP8DSPContext function pointers.
 * NOTE(review): presumably all of them are implemented in the RISC-V Vector
 * assembly file; only ff_vp8_idct_add_rvv's definition is visible here.
 */
void ff_vp8_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
@ -129,6 +130,7 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
if (flags & AV_CPU_FLAG_RVV_I64)
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_rvv;
#endif
c->vp8_idct_add = ff_vp8_idct_add_rvv;
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv;
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
if (flags & AV_CPU_FLAG_RVV_I64)

View File

@ -98,6 +98,65 @@ func ff_vp8_luma_dc_wht_rvv, zve64x
endfunc
#endif
/*
 * void ff_vp8_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
 *
 * Applies a two-pass 4x4 inverse transform to the 16 coefficients in block,
 * adds the rounded result to the 4x4 pixels at dst, clamps to 0..255, writes
 * the pixels back and zeroes block.
 *
 * Register use (arguments per the standard RISC-V calling convention):
 *   a0 = dst, a1 = block, a2 = stride (bytes between dst rows)
 *   a3, a4, a5 = block + 8, + 16, + 24 bytes (2nd, 3rd, 4th group of four
 *                16-bit coefficients)
 *   t1, t2 = multiplier constants 20091 and 35468
 *   t0 = link register for the local subroutine at label 1
 *   v0-v3 = transform rows, v4-v9 = temporaries / dst pixels
 */
func ff_vp8_idct_add_rvv, zve32x
/* Fixed-point rounding mode 0 (round-to-nearest-up), used by vssra below. */
csrwi vxrm, 0
/* VL = 4 elements of 16 bits each. */
vsetivli zero, 4, e16, mf2, ta, ma
addi a3, a1, 1 * 4 * 2
addi a4, a1, 2 * 4 * 2
addi a5, a1, 3 * 4 * 2
li t1, 20091
li t2, 35468
/* First pass: 1-D transform of the four groups, results in v0-v3. */
jal t0, 1f
/*
 * The 4-field segment store interleaves v0-v3 (element i of field f lands at
 * index 4 * i + f), i.e. it writes the first-pass result back transposed.
 */
vsseg4e16.v v0, (a1)
/* Second pass on the transposed data. */
jal t0, 1f
/*
 * Strided segment load of the 4x4 destination pixels: each of the 4 segments
 * starts a2 bytes after the previous one, and byte f of each segment goes to
 * register v(4 + f).
 */
vlsseg4e8.v v4, (a0), a2
/*
 * Rounding arithmetic shift right by 3 of the second-pass results,
 * interleaved with four 8-byte scalar stores that zero the 32-byte block.
 * NOTE(review): sd is an RV64 instruction - confirm that RV32 builds are
 * excluded or handled elsewhere.
 */
vssra.vi v0, v0, 3
sd zero, (a1)
vssra.vi v1, v1, 3
sd zero, 8(a1)
vssra.vi v2, v2, 3
sd zero, 16(a1)
vssra.vi v3, v3, 3
sd zero, 24(a1)
/*
 * Switch to 8-bit elements (VL unchanged) and add the zero-extended pixels to
 * the 16-bit residuals; the .wv form keeps the 16-bit operand and result.
 */
vsetvli zero, zero, e8, mf4, ta, ma
vwaddu.wv v0, v0, v4
vwaddu.wv v1, v1, v5
vwaddu.wv v2, v2, v6
vwaddu.wv v3, v3, v7
/* Back to 16-bit elements: clamp negative sums to zero. */
vsetvli zero, zero, e16, mf2, ta, ma
vmax.vx v0, v0, zero
vmax.vx v1, v1, zero
vmax.vx v2, v2, zero
vmax.vx v3, v3, zero
/*
 * Narrow 16 -> 8 bits with unsigned saturation (shift amount 0), which caps
 * the values at 255.
 */
vsetvli zero, zero, e8, mf4, ta, ma
vnclipu.wi v4, v0, 0
vnclipu.wi v5, v1, 0
vnclipu.wi v6, v2, 0
vnclipu.wi v7, v3, 0
/* Strided segment store of the pixels back to dst, mirroring the load. */
vssseg4e8.v v4, (a0), a2
ret
/*
 * Local subroutine: one 1-D transform pass over the block in memory.
 * In:  a1, a3, a4, a5 = addresses of the four groups; t1, t2 = constants.
 * Out: v0-v3 = transformed groups. Clobbers v4-v9. Returns through t0.
 * The "t0".."t3" in the trailing comments name transform temporaries, not
 * the scalar registers t0-t3.
 */
1:
vle16.v v0, (a1)
vle16.v v2, (a4)
vle16.v v1, (a3)
vle16.v v3, (a5)
vadd.vv v4, v0, v2 # t0
vsub.vv v5, v0, v2 # t1
/*
 * vmulhsu yields the high 16 bits of signed-vector x unsigned-scalar, i.e.
 * (x * c) >> 16. For 20091 the input is added back afterwards, giving
 * ((x * 20091) >> 16) + x.
 */
vmulhsu.vx v8, v3, t1
vmulhsu.vx v6, v1, t2
vadd.vv v8, v8, v3
vmulhsu.vx v7, v1, t1
vmulhsu.vx v9, v3, t2
vadd.vv v7, v7, v1
vsub.vv v6, v6, v8 # t2
vadd.vv v7, v7, v9 # t3
/* Butterfly: out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3. */
vadd.vv v1, v5, v6
vsub.vv v2, v5, v6
vadd.vv v0, v4, v7
vsub.vv v3, v4, v7
jr t0
endfunc
func ff_vp8_idct_dc_add_rvv, zve32x
lh a3, (a1)
addi a3, a3, 4