mirror of
https://git.ffmpeg.org/ffmpeg.git
synced 2024-12-28 02:12:28 +00:00
Merge commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450'
* commit 'bbe4a6db44f0b55b424a5cc9d3e89cd88e250450':
  x86inc: Utilize the shadow space on 64-bit Windows

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
commit
1f17619fe4
@ -672,13 +672,13 @@ cglobal imdct_calc, 3,5,3
|
||||
push r1
|
||||
push r0
|
||||
%else
|
||||
sub rsp, 8
|
||||
sub rsp, 8+32*WIN64 ; allocate win64 shadow space
|
||||
%endif
|
||||
call r4
|
||||
%if ARCH_X86_32
|
||||
add esp, 12
|
||||
%else
|
||||
add rsp, 8
|
||||
add rsp, 8+32*WIN64
|
||||
%endif
|
||||
POP r1
|
||||
POP r3
|
||||
|
@ -331,16 +331,14 @@ cglobal deblock_v_luma_8, 5,5,10
|
||||
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
|
||||
;-----------------------------------------------------------------------------
|
||||
INIT_MMX cpuname
|
||||
cglobal deblock_h_luma_8, 5,9
|
||||
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
|
||||
movsxd r7, r1d
|
||||
lea r8, [r7+r7*2]
|
||||
lea r6, [r0-4]
|
||||
lea r5, [r0-4+r8]
|
||||
%if WIN64
|
||||
sub rsp, 0x98
|
||||
%define pix_tmp rsp+0x30
|
||||
%define pix_tmp rsp+0x30 ; shadow space + r4
|
||||
%else
|
||||
sub rsp, 0x68
|
||||
%define pix_tmp rsp
|
||||
%endif
|
||||
|
||||
@ -379,11 +377,6 @@ cglobal deblock_h_luma_8, 5,9
|
||||
movq m3, [pix_tmp+0x40]
|
||||
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
|
||||
|
||||
%if WIN64
|
||||
add rsp, 0x98
|
||||
%else
|
||||
add rsp, 0x68
|
||||
%endif
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
@ -708,13 +701,16 @@ INIT_MMX cpuname
|
||||
;-----------------------------------------------------------------------------
|
||||
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal deblock_h_luma_intra_8, 4,9
|
||||
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
|
||||
movsxd r7, r1d
|
||||
lea r8, [r7*3]
|
||||
lea r6, [r0-4]
|
||||
lea r5, [r0-4+r8]
|
||||
sub rsp, 0x88
|
||||
%if WIN64
|
||||
%define pix_tmp rsp+0x20 ; shadow space
|
||||
%else
|
||||
%define pix_tmp rsp
|
||||
%endif
|
||||
|
||||
; transpose 8x16 -> tmp space
|
||||
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
|
||||
@ -734,7 +730,6 @@ cglobal deblock_h_luma_intra_8, 4,9
|
||||
sub r5, r7
|
||||
shr r7, 3
|
||||
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
|
||||
add rsp, 0x88
|
||||
RET
|
||||
%else
|
||||
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
|
||||
|
@ -353,14 +353,18 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
||||
%if stack_size < 0
|
||||
%assign stack_size -stack_size
|
||||
%endif
|
||||
%if mmsize != 8
|
||||
%assign xmm_regs_used %2
|
||||
%assign stack_size_padded stack_size
|
||||
%if WIN64
|
||||
%assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
|
||||
%if mmsize != 8
|
||||
%assign xmm_regs_used %2
|
||||
%if xmm_regs_used > 8
|
||||
%assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
|
||||
%endif
|
||||
%endif
|
||||
%endif
|
||||
%if mmsize <= 16 && HAVE_ALIGNED_STACK
|
||||
%assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
|
||||
%if xmm_regs_used > 6
|
||||
%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
|
||||
%endif
|
||||
%assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
|
||||
SUB rsp, stack_size_padded
|
||||
%else
|
||||
%assign %%reg_num (regs_used - 1)
|
||||
@ -370,14 +374,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
||||
; stack in a single instruction (i.e. mov rsp, rstk or mov
|
||||
; rsp, [rsp+stack_size_padded])
|
||||
mov rstk, rsp
|
||||
%assign stack_size_padded stack_size
|
||||
%if xmm_regs_used > 6
|
||||
%assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
|
||||
%if mmsize == 32 && xmm_regs_used & 1
|
||||
; re-align to 32 bytes
|
||||
%assign stack_size_padded (stack_size_padded + 16)
|
||||
%endif
|
||||
%endif
|
||||
%if %1 < 0 ; need to store rsp on stack
|
||||
sub rsp, gprsize+stack_size_padded
|
||||
and rsp, ~(%%stack_alignment-1)
|
||||
@ -389,9 +385,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
||||
%xdefine rstkm rstk
|
||||
%endif
|
||||
%endif
|
||||
%if xmm_regs_used > 6
|
||||
WIN64_PUSH_XMM
|
||||
%endif
|
||||
WIN64_PUSH_XMM
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
@ -452,40 +446,55 @@ DECLARE_REG 14, R15, 120
|
||||
%endmacro
|
||||
|
||||
; WIN64_PUSH_XMM (no parameters): stores the XMM registers numbered 6 and up,
; as selected by xmm_regs_used, to memory using movaps.
; NOTE(review): this text is a rendered diff hunk without +/- markers, so the
; macro body appears to contain both the pre-change and the post-change
; implementation back to back -- confirm against the repository before
; reading it as one coherent macro.
%macro WIN64_PUSH_XMM 0
|
||||
; First variant (presumably the removed lines -- TODO confirm): %%i counts
; down from xmm_regs_used, and every register from xmm6 upward is stored
; relative to rsp, above the stack_size bytes of local space.
%assign %%i xmm_regs_used
|
||||
%rep (xmm_regs_used-6)
|
||||
%assign %%i %%i-1
|
||||
; (~stack_offset&8) evaluates to 8 when bit 3 of stack_offset is clear and to
; 0 otherwise; presumably alignment padding for movaps -- confirm.
movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
|
||||
%endrep
|
||||
; Second variant: xmm6 and xmm7 get two adjacent 16-byte slots addressed from
; rstk + stack_offset (+8 and +24); only xmm8 and above use rsp-relative space.
; NOTE(review): with stack_offset being the stack pointer's offset from the
; return address, +8..+39 would be the 32 bytes directly above the return
; address, i.e. the Win64 shadow space named in the comment below.
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
|
||||
%if xmm_regs_used > 6
|
||||
movaps [rstk + stack_offset + 8], xmm6
|
||||
%endif
|
||||
%if xmm_regs_used > 7
|
||||
movaps [rstk + stack_offset + 24], xmm7
|
||||
%endif
|
||||
%if xmm_regs_used > 8
|
||||
; %%i counts up from 8; register xmm%%i lands at slot (%%i-8) of a 16-byte
; array that starts stack_size + 32 bytes above rsp.
; NOTE(review): the extra 32 presumably skips the shadow space reserved for
; this function's own callees -- confirm against the allocation code.
%assign %%i 8
|
||||
%rep xmm_regs_used-8
|
||||
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
|
||||
%assign %%i %%i+1
|
||||
%endrep
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; WIN64_SPILL_XMM n: records n in xmm_regs_used, reserves stack space with
; SUB rsp and invokes WIN64_PUSH_XMM to store the registers.
;   %1 = number of XMM registers requested (checked against 16 below).
; NOTE(review): this text is a rendered diff hunk without +/- markers; the two
; %if openers below share a single %endif, which suggests the body interleaves
; the pre-change and post-change versions -- confirm against the repository.
%macro WIN64_SPILL_XMM 1
|
||||
%assign xmm_regs_used %1
|
||||
; ASSERT is defined outside this listing; presumably a build-time check.
ASSERT xmm_regs_used <= 16
|
||||
; First variant (presumably the removed lines -- TODO confirm): reserve one
; 16-byte slot per register from xmm6 upward plus 16 extra bytes, then store.
%if xmm_regs_used > 6
|
||||
SUB rsp, (xmm_regs_used-6)*16+16
|
||||
WIN64_PUSH_XMM
|
||||
; Second variant: stack is reserved only when more than 8 registers are used:
; one 16-byte slot per register from xmm8 upward, plus (~stack_offset&8)
; (8 when bit 3 of stack_offset is clear, else 0), plus 32 bytes.
; NOTE(review): the 32 presumably is shadow space for callees and the 0/8 term
; alignment padding -- confirm.
%if xmm_regs_used > 8
|
||||
%assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
|
||||
SUB rsp, stack_size_padded
|
||||
%endif
|
||||
; In this variant WIN64_PUSH_XMM is invoked outside the %if, whatever the
; register count.
WIN64_PUSH_XMM
|
||||
%endmacro
|
||||
|
||||
; WIN64_RESTORE_XMM_INTERNAL base: reloads the XMM registers numbered 6 and up
; (as selected by xmm_regs_used) with movaps and releases the stack space.
;   %1 = register used as the base of every address below; it is also the
;        operand that the `add` instructions adjust.
; NOTE(review): this text is a rendered diff hunk without +/- markers; the
; paired %if / %rep / movaps lines below suggest that pre-change and
; post-change lines are interleaved -- confirm against the repository.
%macro WIN64_RESTORE_XMM_INTERNAL 1
|
||||
%if xmm_regs_used > 6
|
||||
; %%pad_size holds the number of bytes added to %1 further down; it stays 0
; unless the `add %1, stack_size_padded` path is assembled.
%assign %%pad_size 0
|
||||
%if xmm_regs_used > 8
|
||||
; %%i counts down from xmm_regs_used. Two loop headers and two load forms
; follow: one covers registers from xmm6 (slot %%i-6, with the 0/8
; (~stack_offset&8) term), the other registers from xmm8 (slot %%i-8, offset
; by stack_size + 32).
%assign %%i xmm_regs_used
|
||||
%rep (xmm_regs_used-6)
|
||||
%rep xmm_regs_used-8
|
||||
%assign %%i %%i-1
|
||||
movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
|
||||
movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
|
||||
%endrep
|
||||
; Releases (xmm_regs_used-6)*16+16 bytes, the same amount WIN64_SPILL_XMM
; subtracts in its xmm_regs_used > 6 form, when no padded stack size is set.
%if stack_size_padded == 0
|
||||
add %1, (xmm_regs_used-6)*16+16
|
||||
%endif
|
||||
%endif
|
||||
; Release the allocation: either reload rsp from rstkm, or add
; stack_size_padded back to %1 and remember that amount in %%pad_size.
; NOTE(review): rstkm presumably names the saved original stack pointer used
; when the stack was realigned -- confirm against the allocation macro.
%if stack_size_padded > 0
|
||||
%if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
|
||||
mov rsp, rstkm
|
||||
%else
|
||||
add %1, stack_size_padded
|
||||
%assign %%pad_size stack_size_padded
|
||||
%endif
|
||||
%endif
|
||||
; xmm7 and xmm6 are reloaded from offsets +24 and +8 relative to
; %1 + stack_offset - %%pad_size, the same +8 / +24 offsets WIN64_PUSH_XMM
; stores them at.
; NOTE(review): subtracting %%pad_size presumably compensates for %1 having
; just been advanced while stack_offset still counts the released bytes --
; confirm.
%if xmm_regs_used > 7
|
||||
movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
|
||||
%endif
|
||||
%if xmm_regs_used > 6
|
||||
movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro WIN64_RESTORE_XMM 1
|
||||
@ -702,12 +711,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
|
||||
%endif
|
||||
align function_align
|
||||
%2:
|
||||
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
|
||||
%xdefine rstk rsp
|
||||
%assign stack_offset 0
|
||||
%assign stack_size 0
|
||||
%assign stack_size_padded 0
|
||||
%assign xmm_regs_used 0
|
||||
RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
|
||||
%xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
|
||||
%assign stack_offset 0 ; stack pointer offset relative to the return address
|
||||
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
|
||||
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
|
||||
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
|
||||
%ifnidn %3, ""
|
||||
PROLOGUE %3
|
||||
%endif
|
||||
|
Loading…
Reference in New Issue
Block a user