sws: allow avx2 hscale to process inputs of any size.

The main loop processes blocks of 16 pixels. The tail loop then handles
the remaining pixels in blocks of 4.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
This commit is contained in:
Alan Kelly 2022-04-26 10:00:02 +02:00 committed by Anton Khirnov
parent 51a34e8525
commit a6724285fd
1 changed files with 43 additions and 1 deletions

View File

@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
mova m14, [four] mova m14, [four]
shr fltsized, 2 shr fltsized, 2
%endif %endif
cmp wq, 0x10
jl .tail_loop
sub wq, 0x10
.loop: .loop:
movu m1, [fltposq] movu m1, [fltposq]
movu m2, [fltposq+32] movu m2, [fltposq+32]
@ -101,7 +104,46 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize,
add fltposq, 0x40 add fltposq, 0x40
add countq, 0x10 add countq, 0x10
cmp countq, wq cmp countq, wq
jl .loop jle .loop
add wq, 0x10
cmp countq, wq
jge .end
.tail_loop:
movu xm1, [fltposq]
%ifidn %1, X4
pxor xm9, xm9
pxor xm10, xm10
xor innerq, innerq
.tail_innerloop:
%endif
vpcmpeqd xm13, xm13
vpgatherdd xm3,[srcmemq + xm1], xm13
vpunpcklbw xm5, xm3, xm0
vpunpckhbw xm6, xm3, xm0
vpmaddwd xm5, xm5, [filterq]
vpmaddwd xm6, xm6, [filterq + 0x10]
add filterq, 0x20
%ifidn %1, X4
paddd xm9, xm5
paddd xm10, xm6
paddd xm1, xm14
add innerq, 1
cmp innerq, fltsizeq
jl .tail_innerloop
vphaddd xm5, xm9, xm10
%else
vphaddd xm5, xm5, xm6
%endif
vpsrad xm5, 7
vpackssdw xm5, xm5, xm5
vmovq [dstq + countq * 2], xm5
add fltposq, 0x10
add countq, 0x4
cmp countq, wq
jl .tail_loop
.end:
REP_RET REP_RET
%endmacro %endmacro