Mirror of git://git.musl-libc.org/musl
Synced 2025-01-12 09:39:39 +00:00
overhaul optimized x86_64 memset asm
On most CPU models, "rep stosq" has high overhead that makes it undesirable for small memset sizes. The new code extends the minimal-branch fast path for short memsets from size 15 up to size 126, and shrink-wraps this code path. In addition, "rep stosq" is sensitive to misalignment. The cost varies with size and with CPU model, but it has been observed performing 1.5 times slower when the destination address is not aligned mod 16. The new code thus ensures alignment mod 16, but also preserves any existing additional alignment, in case there are CPU models where that is beneficial. This version is based in part on changes proposed by Denys Vlasenko.
This commit is contained in:
parent
69858fa931
commit
e346ff86c8
@@ -1,43 +1,72 @@
 .global memset
 .type memset,@function
 memset:
-	movzbl %sil,%esi
-	mov $0x101010101010101,%rax
-	# 64-bit imul has 3-7 cycles latency, launch early
-	imul %rsi,%rax
+	movzbq %sil,%rax
+	mov $0x101010101010101,%r8
+	imul %r8,%rax
 
-	cmp $16,%rdx
-	jb 1f
+	cmp $126,%rdx
+	ja 2f
 
-	lea -1(%rdx),%rcx
+	test %edx,%edx
+	jz 1f
+
+	mov %sil,(%rdi)
+	mov %sil,-1(%rdi,%rdx)
+	cmp $2,%edx
+	jbe 1f
+
+	mov %ax,1(%rdi)
+	mov %ax,(-1-2)(%rdi,%rdx)
+	cmp $6,%edx
+	jbe 1f
+
+	mov %eax,(1+2)(%rdi)
+	mov %eax,(-1-2-4)(%rdi,%rdx)
+	cmp $14,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4)(%rdi)
+	mov %rax,(-1-2-4-8)(%rdi,%rdx)
+	cmp $30,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4+8)(%rdi)
+	mov %rax,(1+2+4+8+8)(%rdi)
+	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
+	cmp $62,%edx
+	jbe 1f
+
+	mov %rax,(1+2+4+8+16)(%rdi)
+	mov %rax,(1+2+4+8+16+8)(%rdi)
+	mov %rax,(1+2+4+8+16+16)(%rdi)
+	mov %rax,(1+2+4+8+16+24)(%rdi)
+	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
+	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
+
+1:	mov %rdi,%rax
+	ret
+
+2:	test $15,%edi
 	mov %rdi,%r8
-	shr $3,%rcx
 	mov %rax,-8(%rdi,%rdx)
+	mov %rdx,%rcx
+	jnz 2f
+
+1:	shr $3,%rcx
 	rep
 	stosq
 	mov %r8,%rax
 	ret
 
-1:	test %edx,%edx
-	jz 1f
-
-	mov %al,(%rdi)
-	mov %al,-1(%rdi,%rdx)
-	cmp $2,%edx
-	jbe 1f
-
-	mov %al,1(%rdi)
-	mov %al,-2(%rdi,%rdx)
-	cmp $4,%edx
-	jbe 1f
-
-	mov %eax,(%rdi)
-	mov %eax,-4(%rdi,%rdx)
-	cmp $8,%edx
-	jbe 1f
-
-	mov %eax,4(%rdi)
-	mov %eax,-8(%rdi,%rdx)
-
-1:	mov %rdi,%rax
-	ret
+2:	xor %edx,%edx
+	sub %edi,%edx
+	and $15,%edx
+	mov %rax,(%rdi)
+	mov %rax,8(%rdi)
+	sub %rdx,%rcx
+	add %rdx,%rdi
+	jmp 1b
|
Loading…
Reference in New Issue
Block a user