mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2025-01-28 02:02:46 +00:00
x86inc: activate REP_RET automatically
Now RET checks whether it immediately follows a branch, so the programmer doesn't have to keep track of that condition. REP_RET is still needed manually when it's a branch target, but that's much rarer. The implementation involves lots of spurious labels, but that's OK because we strip them. Signed-off-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
This commit is contained in:
parent
ce1e8045e0
commit
25cb0c1a1e
@ -135,8 +135,7 @@ CPUNOP amdnop
|
|||||||
; Pops anything that was pushed by PROLOGUE, and returns.
|
; Pops anything that was pushed by PROLOGUE, and returns.
|
||||||
|
|
||||||
; REP_RET:
|
; REP_RET:
|
||||||
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
|
; Use this instead of RET if it's a branch target.
|
||||||
; which are slow when a normal ret follows a branch.
|
|
||||||
|
|
||||||
; registers:
|
; registers:
|
||||||
; rN and rNq are the native-size register holding function argument N
|
; rN and rNq are the native-size register holding function argument N
|
||||||
@ -484,7 +483,7 @@ DECLARE_REG 14, R15, 120
|
|||||||
%if mmsize == 32
|
%if mmsize == 32
|
||||||
vzeroupper
|
vzeroupper
|
||||||
%endif
|
%endif
|
||||||
ret
|
AUTO_REP_RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%elif ARCH_X86_64 ; *nix x64 ;=============================================
|
%elif ARCH_X86_64 ; *nix x64 ;=============================================
|
||||||
@ -531,7 +530,7 @@ DECLARE_REG 14, R15, 72
|
|||||||
%if mmsize == 32
|
%if mmsize == 32
|
||||||
vzeroupper
|
vzeroupper
|
||||||
%endif
|
%endif
|
||||||
ret
|
AUTO_REP_RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%else ; X86_32 ;==============================================================
|
%else ; X86_32 ;==============================================================
|
||||||
@ -587,7 +586,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
|
|||||||
%if mmsize == 32
|
%if mmsize == 32
|
||||||
vzeroupper
|
vzeroupper
|
||||||
%endif
|
%endif
|
||||||
ret
|
AUTO_REP_RET
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%endif ;======================================================================
|
%endif ;======================================================================
|
||||||
@ -601,6 +600,10 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
|
|||||||
%endmacro
|
%endmacro
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
|
; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
|
||||||
|
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
|
||||||
|
; We can automatically detect "follows a branch", but not a branch target.
|
||||||
|
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
|
||||||
%macro REP_RET 0
|
%macro REP_RET 0
|
||||||
%if has_epilogue
|
%if has_epilogue
|
||||||
RET
|
RET
|
||||||
@ -609,6 +612,29 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
|
; last_branch_adr names the position directly after the most recent branch
; emitted through one of the BRANCH_INSTR wrappers; it starts out as $$.
; AUTO_REP_RET emits "ret", preceded by a "rep" prefix when the current
; position ($) equals last_branch_adr, i.e. when the ret immediately follows
; a branch. The prefix is only considered when cpuflags is undefined or the
; selected cpu flags do not include ssse3.
%define last_branch_adr $$
|
||||||
|
%macro AUTO_REP_RET 0
|
||||||
|
%ifndef cpuflags
|
||||||
|
times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
|
||||||
|
%elif notcpuflag(ssse3)
|
||||||
|
times ((last_branch_adr-$)>>31)+1 rep
|
||||||
|
%endif
|
||||||
|
ret
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; BRANCH_INSTR mnemonic [, mnemonic ...]
; For every mnemonic passed in, defines a macro of the same name that wraps
; the instruction. The wrapper takes the branch operand as its %1 (its
; optional %2 defaults to the mnemonic), emits "%2 %1", places a macro-local
; label directly after the instruction, and redefines last_branch_adr to that
; label. %rotate 1 advances to the next mnemonic on each %rep iteration.
%macro BRANCH_INSTR 0-*
|
||||||
|
%rep %0
|
||||||
|
%macro %1 1-2 %1
|
||||||
|
%2 %1
|
||||||
|
%%branch_instr:
|
||||||
|
%xdefine last_branch_adr %%branch_instr
|
||||||
|
%endmacro
|
||||||
|
%rotate 1
|
||||||
|
%endrep
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
|
||||||
|
|
||||||
%macro TAIL_CALL 2 ; callee, is_nonadjacent
|
%macro TAIL_CALL 2 ; callee, is_nonadjacent
|
||||||
%if has_epilogue
|
%if has_epilogue
|
||||||
call %1
|
call %1
|
||||||
|
Loading…
Reference in New Issue
Block a user