1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
/* Copyright 2002 Andi Kleen, SuSE Labs */
#include <linux/linkage.h>
#include <asm/dwarf2.h>
/*
* ISO C memset - set a memory block to a byte value.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
ALIGN
memset_c:
CFI_STARTPROC
movq %rdi,%r9
movl %edx,%r8d
andl $7,%r8d
movl %edx,%ecx
shrl $3,%ecx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
mulq %rsi /* with rax, clobbers rdx */
rep stosq
movl %r8d,%ecx
rep stosb
movq %r9,%rax
ret
CFI_ENDPROC
ENDPROC(memset_c)
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
movq %rdi,%r10
movq %rdx,%r11
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
mul %rcx /* with rax, clobbers rdx */
/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
CFI_REMEMBER_STATE
.Lafter_bad_alignment:
movl %r11d,%ecx
shrl $6,%ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decl %ecx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %r11d,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8
.Lhandle_7:
movl %r11d,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
decl %ecx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
.Lende:
movq %r10,%rax
ret
CFI_RESTORE_STATE
.Lbad_alignment:
cmpq $7,%r11
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
/* Some CPUs run faster using the string instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (memset_c - memset) - (2f - 1b) /* offset */
2:
.previous
.section .altinstructions,"a"
.align 8
.quad memset
.quad 1b
.byte X86_FEATURE_REP_GOOD
.byte .Lfinal - memset
.byte 2b - 1b
.previous
|