libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
Alan Kelly 2021-04-01 12:00:15 +02:00 committed by Michael Niedermayer
parent dc57762cb4
commit 3ce8d09244
1 changed file with 13 additions and 1 deletion

View File

@ -37,8 +37,10 @@ SECTION .text
cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
%if notcpuflag(sse3) %if notcpuflag(sse3)
%define movr mova %define movr mova
%define unroll 1
%else %else
%define movr movdqu %define movr movdqu
%define unroll 2
%endif %endif
movsxdifnidn dstWq, dstWd movsxdifnidn dstWq, dstWd
movsxdifnidn offsetq, offsetd movsxdifnidn offsetq, offsetd
@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
.outerloop: .outerloop:
mova m4, m7 mova m4, m7
mova m3, m7 mova m3, m7
%if cpuflag(sse3)
mova m6, m7 mova m6, m7
mova m1, m7 mova m1, m7
%endif
.loop: .loop:
%if cpuflag(avx2) %if cpuflag(avx2)
vpbroadcastq m0, [filterSizeq + 8] vpbroadcastq m0, [filterSizeq + 8]
@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + mmsize]
paddw m3, m3, m2 paddw m3, m3, m2
paddw m4, m4, m5 paddw m4, m4, m5
%if cpuflag(sse3)
pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
paddw m6, m6, m2 paddw m6, m6, m2
paddw m1, m1, m5 paddw m1, m1, m5
%endif
add filterSizeq, $10 add filterSizeq, $10
mov srcq, [filterSizeq] mov srcq, [filterSizeq]
test srcq, srcq test srcq, srcq
jnz .loop jnz .loop
psraw m3, m3, 3 psraw m3, m3, 3
psraw m4, m4, 3 psraw m4, m4, 3
%if cpuflag(sse3)
psraw m6, m6, 3 psraw m6, m6, 3
psraw m1, m1, 3 psraw m1, m1, 3
%endif
packuswb m3, m3, m4 packuswb m3, m3, m4
%if cpuflag(sse3)
packuswb m6, m6, m1 packuswb m6, m6, m1
%endif
mov srcq, [filterq] mov srcq, [filterq]
%if cpuflag(avx2) %if cpuflag(avx2)
vpermq m3, m3, 216 vpermq m3, m3, 216
vpermq m6, m6, 216 vpermq m6, m6, 216
%endif %endif
movr [destq + offsetq], m3 movr [destq + offsetq], m3
%if cpuflag(sse3)
movr [destq + offsetq + mmsize], m6 movr [destq + offsetq + mmsize], m6
add offsetq, mmsize * 2 %endif
add offsetq, mmsize * unroll
mov filterSizeq, filterq mov filterSizeq, filterq
cmp offsetq, dstWq cmp offsetq, dstWq
jb .outerloop jb .outerloop