|
@@ -1,43 +1,72 @@
|
|
|
.global memset
|
|
|
.type memset,@function
|
|
|
memset:
|
|
|
- movzbl %sil,%esi
|
|
|
- mov $0x101010101010101,%rax
|
|
|
- # 64-bit imul has 3-7 cycles latency, launch early
|
|
|
- imul %rsi,%rax
|
|
|
+ movzbq %sil,%rax                    # memset(rdi=dest, sil=c, rdx=n): rax = fill byte c zero-extended to 64 bits
|
|
|
+ mov $0x101010101010101,%r8          # r8 = 0x01 in each of the 8 byte lanes
|
|
|
+ imul %r8,%rax                       # rax = c replicated into all 8 bytes (ax/eax hold the 2-/4-byte patterns)
|
|
|
|
|
|
- cmp $16,%rdx
|
|
|
- jb 1f
|
|
|
+ cmp $126,%rdx                       # n <= 126 is filled by the branchy overlapping stores below
|
|
|
+ ja 2f                               # unsigned compare (n is a size); larger fills use the rep stosq path
|
|
|
|
|
|
- lea -1(%rdx),%rcx
|
|
|
- mov %rdi,%r8
|
|
|
- shr $3,%rcx
|
|
|
- mov %rax,-8(%rdi,%rdx)
|
|
|
- rep
|
|
|
- stosq
|
|
|
- mov %r8,%rax
|
|
|
- ret
|
|
|
-
|
|
|
-1: test %edx,%edx
|
|
|
+ test %edx,%edx                      # n <= 126 here, so edx holds all of n; n == 0 must store nothing
|
|
|
jz 1f
|
|
|
|
|
|
- mov %al,(%rdi)
|
|
|
- mov %al,-1(%rdi,%rdx)
|
|
|
+ mov %sil,(%rdi)                     # byte 0
|
|
|
+ mov %sil,-1(%rdi,%rdx)              # byte n-1 (same byte as above when n == 1)
|
|
|
cmp $2,%edx
|
|
|
jbe 1f
|
|
|
|
|
|
- mov %al,1(%rdi)
|
|
|
- mov %al,-2(%rdi,%rdx)
|
|
|
- cmp $4,%edx
|
|
|
+ mov %ax,1(%rdi)                     # bytes 1..2; head now covers 0..2
|
|
|
+ mov %ax,(-1-2)(%rdi,%rdx)           # bytes n-3..n-2; tail now covers n-3..n-1
|
|
|
+ cmp $6,%edx                         # 3-byte head + 3-byte tail cover every byte when n <= 6
|
|
|
+ jbe 1f
|
|
|
+
|
|
|
+ mov %eax,(1+2)(%rdi)                # bytes 3..6; head now covers 0..6
|
|
|
+ mov %eax,(-1-2-4)(%rdi,%rdx)        # bytes n-7..n-4; tail now covers n-7..n-1
|
|
|
+ cmp $14,%edx                        # 7 + 7 bytes cover every byte when n <= 14
|
|
|
+ jbe 1f
|
|
|
+
|
|
|
+ mov %rax,(1+2+4)(%rdi)              # bytes 7..14; head now covers 0..14
|
|
|
+ mov %rax,(-1-2-4-8)(%rdi,%rdx)      # bytes n-15..n-8; tail now covers n-15..n-1
|
|
|
+ cmp $30,%edx                        # 15 + 15 bytes cover every byte when n <= 30
|
|
|
jbe 1f
|
|
|
|
|
|
- mov %eax,(%rdi)
|
|
|
- mov %eax,-4(%rdi,%rdx)
|
|
|
- cmp $8,%edx
|
|
|
+ mov %rax,(1+2+4+8)(%rdi)            # bytes 15..22
|
|
|
+ mov %rax,(1+2+4+8+8)(%rdi)          # bytes 23..30; head now covers 0..30
|
|
|
+ mov %rax,(-1-2-4-8-16)(%rdi,%rdx)   # bytes n-31..n-24
|
|
|
+ mov %rax,(-1-2-4-8-8)(%rdi,%rdx)    # bytes n-23..n-16; tail now covers n-31..n-1
|
|
|
+ cmp $62,%edx                        # 31 + 31 bytes cover every byte when n <= 62
|
|
|
jbe 1f
|
|
|
|
|
|
- mov %eax,4(%rdi)
|
|
|
- mov %eax,-8(%rdi,%rdx)
|
|
|
+ mov %rax,(1+2+4+8+16)(%rdi)         # bytes 31..38
|
|
|
+ mov %rax,(1+2+4+8+16+8)(%rdi)       # bytes 39..46
|
|
|
+ mov %rax,(1+2+4+8+16+16)(%rdi)      # bytes 47..54
|
|
|
+ mov %rax,(1+2+4+8+16+24)(%rdi)      # bytes 55..62; head now covers 0..62
|
|
|
+ mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx) # bytes n-63..n-56
|
|
|
+ mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx) # bytes n-55..n-48
|
|
|
+ mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx) # bytes n-47..n-40
|
|
|
+ mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx) # bytes n-39..n-32; 63 + 63 bytes cover every byte up to n == 126
|
|
|
|
|
|
1: mov %rdi,%rax
|
|
|
ret
|
|
|
+
|
|
|
+2: test $15,%edi                      # large fill (n > 126): ZF set iff dest is 16-byte aligned; consumed by jnz below
|
|
|
+ mov %rdi,%r8                        # keep the original dest for the return value; stosq advances rdi (mov leaves flags intact)
|
|
|
+ mov %rax,-8(%rdi,%rdx)              # fill the final 8 bytes now, so the qword loop may stop up to 7 bytes short
|
|
|
+ mov %rdx,%rcx                       # rcx = byte count for the loop
|
|
|
+ jnz 2f                              # unaligned dest: fix up the head first
|
|
|
+
|
|
|
+1: shr $3,%rcx                        # bytes -> whole qwords (remainder was covered by the tail store)
|
|
|
+ rep                                 # repeat the following stosq rcx times
|
|
|
+ stosq                               # store rax at (%rdi), advance rdi by 8
|
|
|
+ mov %r8,%rax                        # return the original dest
|
|
|
+ ret
|
|
|
+
|
|
|
+2: xor %edx,%edx                      # n is no longer needed in rdx (copied to rcx, tail already stored)
|
|
|
+ sub %edi,%edx                       # edx = -dest (low 32 bits)
|
|
|
+ and $15,%edx                        # rdx = bytes up to the next 16-byte boundary (1..15 on this path)
|
|
|
+ mov %rax,(%rdi)                     # fill the first 16 bytes unaligned ...
|
|
|
+ mov %rax,8(%rdi)                    # ... which covers the skipped head bytes (n > 126, so this stays in bounds)
|
|
|
+ sub %rdx,%rcx                       # remaining byte count after the head
|
|
|
+ add %rdx,%rdi                       # rdi = dest rounded up to 16-byte alignment
|
|
|
+ jmp 1b                              # continue with the aligned qword loop
|