mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-19 05:40:56 +00:00
bbe95f7353
From x86inc: > On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either > a branch or a branch target. So switch to a 2-byte form of ret in that case. > We can automatically detect "follows a branch", but not a branch target. > (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) x86inc can automatically determine whether to use REP_RET rather than RET in most of these cases, so the impact is minimal. Additionally, a few REP_RETs were used unnecessarily, despite the return being nowhere near a branch. The only CPUs affected were AMD K10s, made between 2007 and 2011, 16 years ago and 12 years ago, respectively. In the future, everyone involved with x86inc should consider dropping REP_RETs altogether.
144 lines
3.8 KiB
NASM
144 lines
3.8 KiB
NASM
;******************************************************************************
|
|
;* SIMD-optimized HuffYUV functions
|
|
;* Copyright (c) 2008 Loren Merritt
|
|
;* Copyright (c) 2014 Christophe Gisquet
|
|
;*
|
|
;* This file is part of FFmpeg.
|
|
;*
|
|
;* FFmpeg is free software; you can redistribute it and/or
|
|
;* modify it under the terms of the GNU Lesser General Public
|
|
;* License as published by the Free Software Foundation; either
|
|
;* version 2.1 of the License, or (at your option) any later version.
|
|
;*
|
|
;* FFmpeg is distributed in the hope that it will be useful,
|
|
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;* Lesser General Public License for more details.
|
|
;*
|
|
;* You should have received a copy of the GNU Lesser General Public
|
|
;* License along with FFmpeg; if not, write to the Free Software
|
|
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
;******************************************************************************
|
|
|
|
%include "libavutil/x86/x86util.asm"
|
|
|
|
SECTION .text
|
|
|
|
%include "libavcodec/x86/huffyuvdsp_template.asm"
|
|
|
|
;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;
; ADD_INT16 emits one add_int16 function for whichever instruction set was
; selected by the preceding INIT_XMM / INIT_YMM.  The code written here only
; dispatches on pointer alignment; the actual work is the INT16_LOOP macro,
; which is not defined in this file (presumably it comes from the included
; huffyuvdsp_template.asm).
;
;   src and dst both mmsize-aligned -> INT16_LOOP a, add   (aligned flavour)
;   otherwise                       -> INT16_LOOP u, add   (unaligned flavour)
;
; NOTE(review): the aligned expansion must end with its own return inside
; INT16_LOOP, otherwise execution would fall through into .unaligned --
; confirm against the template.
; NOTE(review): cglobal requests 4 arguments / 4 GPRs but names a fifth
; register (tmp); presumably INT16_LOOP preserves it itself -- confirm.
;------------------------------------------------------------------------------
%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test        srcq, mmsize-1      ; low address bits set => src is not vector-aligned
    jnz .unaligned
    test        dstq, mmsize-1      ; same check for dst
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add
%endmacro
|
|
|
|
; SSE2 version of add_int16.
INIT_XMM sse2
ADD_INT16

; AVX2 version of add_int16, assembled only when the build configuration
; defines HAVE_AVX2_EXTERNAL as non-zero.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif
|
|
|
|
;------------------------------------------------------------------------------
; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
;
; Byte-wise running sum over 4-byte pixels: every output pixel is the previous
; output pixel plus the corresponding src pixel, each of the four bytes being
; added independently (paddb).  The running value is seeded from the 4 bytes
; at *left and the last pixel produced is written back to *left.
;
; Register roles inside the loop:
;   dstq, srcq  one past the end of the w pixels (base + 4*w)
;   wq          negative byte offset from that end, counting up towards 0
;   m0          previous output vector; its top dword is the running "left"
;   m1, m2      scratch for the in-vector prefix sum
;
; The loop is do-while shaped and always handles one whole vector (mmsize
; bytes) per iteration: it runs at least once, and if 4*w is not a multiple
; of mmsize the last iteration reads src and writes dst past the w-th pixel.
; NOTE(review): callers presumably guarantee suitable padding -- confirm.
; NOTE(review): LSHIFT is an x86util macro not visible here; the comments
; below assume it shifts the whole register left by the given BYTE count.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl           wq, 2                 ; pixels -> bytes
    movd          m0, [leftq]           ; seed pixel in the low dword
    lea         dstq, [dstq + wq]       ; point both buffers at their end ...
    lea         srcq, [srcq + wq]
    LSHIFT        m0, mmsize-4          ; move the seed into the top dword
    neg           wq                    ; ... and index them with a negative offset
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
    LSHIFT        m1, 4
    paddb         m1, m2                ; each pixel += its left neighbour
    pshufd        m0, m0, q3333         ; broadcast previous last pixel to all lanes
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2                ; prefix sum over the pixels of this vector
    paddb         m0, m1                ; add carry-in from the previous vector
    movu   [dstq+wq], m0
    add           wq, mmsize
    jl .loop
    movd          m0, [dstq-4]          ; last pixel of the w-pixel row
    movd     [leftq], m0
    RET
|
|
|
|
|
|
;------------------------------------------------------------------------------
; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top,
;                                 const uint16_t *diff, unsigned mask, int w,
;                                 int *left, int *left_top)
;
; Median prediction on 16-bit samples.  For every sample:
;     pred   = median(l, t, (l + t - tl) & mask)
;     dst[x] = (pred + diff[x]) & mask
; where l is the previous output sample, t = top[x] and tl = top[x-1].
; *left and *left_top seed l and tl for the first sample; on return they hold
; the last dst word and the last top word, zero-extended to 32 bits.
;
; Register roles:
;   mm0  t - tl for the remaining samples of the current group
;   mm1  t      for the remaining samples of the current group
;   mm2  diff   for the remaining samples of the current group
;   mm3  l (only the low word is meaningful)
;   mm6  mask, replicated by SPLATW (x86util macro, not visible here)
;   mm7  output words collected for the current group
;   wq   negative byte offset from the end of the buffers
;
; Samples are handled in groups of four (8 bytes); the loop is do-while
; shaped, so the buffers are accessed in whole 8-byte groups even when w is
; not a multiple of 4.
; NOTE(review): callers presumably guarantee suitable padding -- confirm.
; NOTE(review): pminsw/pmaxsw compare SIGNED words, so samples are presumably
; expected to stay below 0x8000 -- confirm.
; NOTE(review): *left and *left_top are loaded as 32 bits with movd and the
; upper 16 bits are not cleared; presumably they are always zero -- confirm.
;------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd                 ; samples -> bytes
    movd    mm6, maskd
    SPLATW  mm6, mm6
    ; First group: tl comes from *left_top rather than from a previous group.
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0                ; t
    por     mm4, mm2                ; tl = { *left_top, t0, t1, t2 }
    movd    mm3, [leftq]            ; l
    psubw   mm0, mm4                ; t-tl
    add    dstq, wq                 ; point all buffers at their end ...
    add    topq, wq
    add   diffq, wq
    neg      wq                     ; ... and index them with a negative offset
    jmp .skip                       ; t-tl for the first group is already computed
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1                ; low word of mm1 = last t of the previous group
    movq    mm1, mm0                ; t
    psubw   mm0, mm4                ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 4
    ; One sample per repetition; only the low word of each register matters.
    movq    mm4, mm0
    paddw   mm4, mm3                ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1                ; max(l, t)
    pminsw  mm5, mm1                ; min(l, t)
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5                ; median
    paddw   mm3, mm2                ; +residual
    pand    mm3, mm6                ; result, also the next l
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48                 ; start collecting: result in the top word
%else
    movq    mm4, mm3
    psrlq   mm7, 16                 ; earlier results slide down one word
    psllq   mm4, 48
    por     mm7, mm4                ; newest result enters at the top word
%endif
%if i<3
    psrlq   mm0, 16                 ; bring the next sample into the low word
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    ; r2 is the diff argument register; it is no longer needed, reuse as scratch.
    movzx   r2d, word [dstq-2]      ; last output sample
    mov [leftq], r2d
    movzx   r2d, word [topq-2]      ; last top sample
    mov [left_topq], r2d
    RET
|