mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-02-18 21:07:01 +00:00
avutil/pixelutils: faster pixelutils_sad_[au]_16x16
~560 → ~500 decicycles. This follows the comments from Michael in https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html. Using 2 registers for the accumulator didn't help. On the other hand, some re-ordering between the movs and psadbw allowed going from ~538 to ~500 decicycles.
This commit is contained in:
parent
c82a288f87
commit
45c7f3997e
@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
|
|||||||
%macro SAD_XMM_16x16 1
|
%macro SAD_XMM_16x16 1
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
|
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
|
||||||
pxor m2, m2
|
mov%1 m2, [src2q]
|
||||||
%rep 8
|
psadbw m2, [src1q]
|
||||||
mov%1 m0, [src2q]
|
|
||||||
mov%1 m1, [src2q + stride2q]
|
mov%1 m1, [src2q + stride2q]
|
||||||
|
psadbw m1, [src1q + stride1q]
|
||||||
|
paddw m2, m1
|
||||||
|
%rep 7
|
||||||
|
lea src1q, [src1q + 2*stride1q]
|
||||||
|
lea src2q, [src2q + 2*stride2q]
|
||||||
|
mov%1 m0, [src2q]
|
||||||
psadbw m0, [src1q]
|
psadbw m0, [src1q]
|
||||||
|
mov%1 m1, [src2q + stride2q]
|
||||||
psadbw m1, [src1q + stride1q]
|
psadbw m1, [src1q + stride1q]
|
||||||
paddw m2, m0
|
paddw m2, m0
|
||||||
paddw m2, m1
|
paddw m2, m1
|
||||||
lea src1q, [src1q + 2*stride1q]
|
|
||||||
lea src2q, [src2q + 2*stride2q]
|
|
||||||
%endrep
|
%endrep
|
||||||
movhlps m0, m2
|
movhlps m0, m2
|
||||||
paddw m2, m0
|
paddw m2, m0
|
||||||
|
Loading…
Reference in New Issue
Block a user