lavc/vc1dsp: fix overflow in R-V V inv_trans_8

The last set of additions/subtractions can exceed the 16-bit limit and
requires 17 bits of precision. This uses widening adds and subtracts
accordingly to fix the MSS2 FATE tests.

The problem could also affect inv_trans_4, albeit with very low
probability, but this is not reproducible under FATE.
This commit is contained in:
Rémi Denis-Courmont 2024-06-27 21:21:29 +03:00
parent 2c900d4c11
commit 349c49fd1b

View File

@@ -141,44 +141,49 @@ func ff_vc1_inv_trans_8_rvv, zve32x
vadd.vv v20, v20, v21
vadd.vv v22, v22, v23
vsll.vi v21, v3, 2
vadd.vv v16, v20, v22 # t1
vadd.vv v24, v20, v22 # t1
vmul.vx v20, v1, t5
vsll.vi v22, v5, 4
vmul.vx v23, v7, t3
vsub.vv v20, v20, v21
vadd.vv v22, v22, v23
vsll.vi v21, v3, 4
vsub.vv v17, v20, v22 # t2
vsub.vv v25, v20, v22 # t2
vmul.vx v20, v1, t3
vsll.vi v22, v5, 2
vmul.vx v23, v7, t5
vsub.vv v20, v20, v21
vadd.vv v22, v22, v23
vmul.vx v21, v3, t3
vadd.vv v18, v20, v22 # t3
vadd.vv v26, v20, v22 # t3
vsll.vi v20, v1, 2
vmul.vx v22, v5, t5
vsll.vi v23, v7, 4
vsub.vv v20, v20, v21
vsub.vv v22, v22, v23
vadd.vv v27, v20, v22 # t4
srli t2, t1, 2
vadd.vv v0, v28, v16
vadd.vv v19, v20, v22 # t4
vadd.vv v1, v29, v17
vadd.vv v2, v30, v18
vadd.vv v3, v31, v19
vsub.vv v4, v31, v19
vsub.vv v5, v30, v18
vsub.vv v6, v29, v17
vsub.vv v7, v28, v16
vwadd.vv v8, v28, v24
vwadd.vv v10, v29, v25
vwadd.vv v12, v30, v26
vwadd.vv v14, v31, v27
beqz t2, 1f # faster than 4x add t2=zero
.irp n,4,5,6,7
.irp n,31,30,29,28
vadd.vi v\n, v\n, 1
.endr
1:
.irp n,0,1,2,3,4,5,6,7
vssra.vx v\n, v\n, t1
.endr
vwsub.vv v16, v31, v27
vwsub.vv v18, v30, v26
vwsub.vv v20, v29, v25
vwsub.vv v22, v28, v24
vnclip.wx v0, v8, t1
vnclip.wx v1, v10, t1
vnclip.wx v2, v12, t1
vnclip.wx v3, v14, t1
vnclip.wx v4, v16, t1
vnclip.wx v5, v18, t1
vnclip.wx v6, v20, t1
vnclip.wx v7, v22, t1
jr t0
endfunc