Mirror of https://git.ffmpeg.org/ffmpeg.git (synced 2024-12-27 18:02:11 +00:00)

Commit bbe95f7353
From x86inc:

> On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
> a branch or a branch target. So switch to a 2-byte form of ret in that case.
> We can automatically detect "follows a branch", but not a branch target.
> (SSSE3 is a sufficient condition to know that your cpu doesn't have this
> problem.)

x86inc can automatically determine whether to use REP_RET rather than RET in
most of these cases, so the impact is minimal. Additionally, a few REP_RETs
were used unnecessarily, despite the return being nowhere near a branch.

The only CPUs affected were AMD K10s, made between 2007 and 2011, i.e. 16 and
12 years ago, respectively. In the future, everyone involved with x86inc
should consider dropping REP_RETs altogether.
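For reference, the 2-byte form is an ordinary ret carrying a rep prefix
(encoding F3 C3), which the K10 branch predictor handles correctly where a
bare C3 right after a branch would stall. A minimal sketch of the pattern
(hypothetical label, not taken from this commit):

    align 16
.tail_loop:
    dec     ecx
    jnz     .tail_loop ; the return lands immediately after a branch...
    rep ret            ; ...so the 2-byte F3 C3 form avoids the K10 penalty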
395 lines
9.6 KiB
NASM
;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

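; pshufb masks: a -1 byte ("z" in the name) zeroes the destination byte, any
; other value selects the source byte at that (hex) index.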
cextern pb_15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db  7
pb_ef:               times 8 db 14,15
pb_67:               times 8 db  6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1, 3, 3, 3, 3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1, 1, 1,-1,-1, 5, 5,-1,-1, 9, 9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text

;------------------------------------------------------------------------------
; void ff_add_median_pred(uint8_t *dst, const uint8_t *top,
;                         const uint8_t *diff, int w,
;                         int *left, int *left_top)
;------------------------------------------------------------------------------
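; Reconstructs dst[i] = median(l, t, l + t - tl) + diff[i], where l is the
; previous output byte, t = top[i] and tl = top[i-1]; the median is computed
; branchlessly with pmaxub/pminub.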
INIT_XMM sse2
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2
    movd    m3, [leftq]
    psubb   m0, m4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp     .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1
    mova    m1, m0 ; t
    psubb   m0, m4 ; t-tl
.skip:
    movu    m2, [diffq+wq]
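; The recurrence is serial in x (each l is the previous output byte), so the
; vector is unrolled into mmsize scalar steps: lane 0 of m0/m1/m2 feeds each
; step, outputs are collected into m7, and the inputs are shifted right one
; byte to expose the next lane.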
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3 ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5 ; median
    paddb   m3, m2 ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu    [dstq+wq], m7
    add     wq, mmsize
    jl      .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET

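; ADD_LEFT_LOOP reconstructs a whole vector of left-predicted bytes per
; iteration: a shift-add prefix sum (offset 1 via psllw, offsets 2/4/8 via
; pshufb broadcasts of partial sums), after which the carried left value in
; (x)m0 is added to every byte and rebroadcast from the last lane for the
; next iteration.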
%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    pshufb  xm0, xm5
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    psllw   m2, m1, 8
    paddb   m1, m2
    pshufb  m2, m1, m3
    paddb   m1, m2
    pshufb  m2, m1, m4
    paddb   m1, m2
%if mmsize >= 16
    pshufb  m2, m1, m6
    paddb   m1, m2
%endif
    paddb   xm0, xm1
%if %1
    mova    [dstq+wq], xm0
%else
    movq    [dstq+wq], xm0
    movhps  [dstq+wq+8], xm0
%endif

%if mmsize == 32
    vextracti128 xm2, m1, 1 ; get the second lane of the ymm
    pshufb  xm0, xm5        ; broadcast the last value of the first lane
    paddb   xm0, xm2
    ; store the result
%if %1
    mova    [dstq+wq+16], xm0
%else
    movq    [dstq+wq+16], xm0
    movhps  [dstq+wq+16+8], xm0
%endif
%endif
    add     wq, mmsize
    jl      %%.loop
%if mmsize == 32
    movzx   eax, byte [dstq - 1]
%else
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
%endif
    RET
%endmacro

;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;------------------------------------------------------------------------------
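; Returns the new left value (the last reconstructed byte) in eax, produced by
; the tail of ADD_LEFT_LOOP.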
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_LEFT_LOOP 1, 1

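; ADD_LEFT_PRED_UNALIGNED probes src/dst alignment at run time and falls
; through to the matching ADD_LEFT_LOOP variant; each variant ends in RET,
; so the tests run only once per call.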
%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
    mova    xm5, [pb_15]
    VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
    VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
    VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
    movd    xm0, leftm
    pslldq  xm0, 15
    test    srcq, mmsize - 1
    jnz     .src_unaligned
    test    dstq, mmsize - 1
    jnz     .dst_unaligned
    ADD_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_LEFT_LOOP 0, 0
%endmacro

INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif

;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------
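; dst[i] += src[i]: the main loop (.1) adds two vectors per iteration, the
; scalar tail (.3) handles the remaining w % (2*mmsize) bytes one at a time.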
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize
    jz      .2
    add     dstq, sizeq
    add     srcq, sizeq
    neg     sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl      .1
.2:
    and     wq, 2*mmsize-1
    jz      .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]
    add     [dstq + wq], sizeb
    inc     wq
    jl      .3
.end:
    RET
%endmacro

INIT_XMM sse2
ADD_BYTES

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif

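; 16-bit variant of the left-prediction prefix sum: w is first doubled to a
; byte count, the shift-add steps operate on words, and each result is masked
; with m7 down to the sample bit depth before being stored.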
%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2
    pshufb  m0, m5
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl      %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz     .src_unaligned
    test    dstq, 15
    jnz     .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u

;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
;---------------------------------------------------------------------------------------------
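; In-place gradient reconstruction: src[x] += A - B + C with A = src[x-stride],
; B = src[x-stride-1] and C = src[x-1]. Since C is the value just written, a
; whole row segment is a prefix sum of (A - B + residual) seeded with C, which
; the loop below computes with shift-add steps.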
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
    mova    xm0, [pb_15]

    ; load src[-1] into xm1 and broadcast it to all bytes
    movd    xm1, [srcq-1]
%if cpuflag(avx2)
    vpbroadcastb xm1, xm1
%else
    pxor    xm2, xm2
    pshufb  xm1, xm2
%endif

    add     srcq, widthq
    neg     widthq
    neg     strideq

.loop:
    lea     tmpq, [srcq + strideq]
    mova    m2, [tmpq + widthq]     ; A = src[x - stride]
    movu    m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
    mova    m4, [srcq + widthq]     ; current value (src[x])

    psubb   m2, m3 ; A - B

    ; prefix sum of A - B
    pslldq  m3, m2, 1
    paddb   m2, m3
    pslldq  m3, m2, 2
    paddb   m2, m3
    pslldq  m3, m2, 4
    paddb   m2, m3
    pslldq  m3, m2, 8
    paddb   m2, m3

    ; prefix sum of the current values
    pslldq  m3, m4, 1
    paddb   m4, m3
    pslldq  m3, m4, 2
    paddb   m4, m3
    pslldq  m3, m4, 4
    paddb   m4, m3
    pslldq  m3, m4, 8
    paddb   m4, m3

    ; final sum
    paddb   m2, m4 ; current + (A - B)

    paddb   xm1, xm2             ; += C
    mova    [srcq + widthq], xm1 ; store

    pshufb  xm1, xm0 ; broadcast the last byte to all bytes of xm1

%if mmsize == 32
    vextracti128 xm2, m2, 1 ; get the second lane of the ymm
    paddb   xm1, xm2        ; += C

    mova    [srcq + widthq + 16], xm1 ; store
    pshufb  xm1, xm0 ; broadcast the last byte to all bytes of xm1
%endif

    add     widthq, mmsize
    jl      .loop
    RET

%endmacro

INIT_XMM ssse3
ADD_GRADIENT_PRED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif