sws/rgb2rgb: fix unaligned accesses in RISC-V V YUYV to I422p

In my personal opinion, we should not need to support unaligned YUY2
pixel maps. They should always be aligned to at least 32 bits, and the
current code assumes just 16 bits. However, checkasm does test for
unaligned input bitmaps. QEMU accepts them, but real hardware does not.

In this particular case, we can at the same time improve performance and
handle unaligned inputs, so do just that.

uyvytoyuv422_c:      104379.0
uyvytoyuv422_c:      104060.0
uyvytoyuv422_rvv_i32: 25284.0 (before)
uyvytoyuv422_rvv_i32: 19303.2 (after)
This commit is contained in:
Rémi Denis-Courmont 2023-11-09 20:19:47 +02:00
parent 5b8b5ec9c5
commit 6d60cc7baf
2 changed files with 29 additions and 24 deletions

View File

@ -55,8 +55,10 @@ av_cold void rgb2rgb_init_riscv(void)
shuffle_bytes_1230 = ff_shuffle_bytes_1230_rvv;
shuffle_bytes_3012 = ff_shuffle_bytes_3012_rvv;
interleaveBytes = ff_interleave_bytes_rvv;
uyvytoyuv422 = ff_uyvytoyuv422_rvv;
yuyvtoyuv422 = ff_yuyvtoyuv422_rvv;
if (flags & AV_CPU_FLAG_RVB_BASIC) {
uyvytoyuv422 = ff_uyvytoyuv422_rvv;
yuyvtoyuv422 = ff_yuyvtoyuv422_rvv;
}
}
#endif
}

View File

@ -126,32 +126,35 @@ func ff_deinterleave_bytes_rvv, zve32x
ret
endfunc
.macro yuy2_to_i422p y_shift
slli t4, a4, 1 // pixel width -> (source) byte width
.macro yuy2_to_i422p luma, chroma
srai t4, a4, 1 // pixel width -> chroma width
lw t6, (sp)
slli t5, a4, 1 // pixel width -> (source) byte width
sub a6, a6, a4
srai a4, a4, 1 // pixel width -> chroma width
sub a7, a7, a4
sub t6, t6, t4
sub a7, a7, t4
sub t6, t6, t5
vsetvli t2, zero, e8, m4, ta, ma
1:
mv t4, a4
addi a5, a5, -1
2:
vsetvli t5, t4, e8, m2, ta, ma
vlseg2e16.v v16, (a3)
sub t4, t4, t5
vnsrl.wi v24, v16, \y_shift // Y0
sh2add a3, t5, a3
vnsrl.wi v26, v20, \y_shift // Y1
vnsrl.wi v28, v16, 8 - \y_shift // U
vnsrl.wi v30, v20, 8 - \y_shift // V
vsseg2e8.v v24, (a0)
sh1add a0, t5, a0
vse8.v v28, (a1)
add a1, t5, a1
vse8.v v30, (a2)
add a2, t5, a2
bnez t4, 2b
min t0, t2, t4 // ensure even VL on penultimate iteration
vsetvli t0, t0, e8, m4, ta, ma
vlseg2e8.v v16, (a3)
srli t1, t0, 1
vsetvli zero, t1, e8, m2, ta, ma
vnsrl.wi v24, \chroma, 0 // U
sub t4, t4, t0
vnsrl.wi v28, \chroma, 8 // V
sh1add a3, t0, a3
vse8.v v24, (a1)
add a1, t1, a1
vse8.v v28, (a2)
add a2, t1, a2
vsetvli zero, t0, e8, m4, ta, ma
vse8.v \luma, (a0)
add a0, t0, a0
bnez t4, 2b
add a3, a3, t6
add a0, a0, a6
@ -163,9 +166,9 @@ endfunc
.endm
func ff_uyvytoyuv422_rvv, zve32x
yuy2_to_i422p 8
yuy2_to_i422p v20, v16
endfunc
func ff_yuyvtoyuv422_rvv, zve32x
yuy2_to_i422p 0
yuy2_to_i422p v16, v20
endfunc