aboutsummaryrefslogtreecommitdiff
path: root/arch/x86_64/lib/memcpy.S
diff options
context:
space:
mode:
author Andi Kleen <ak@suse.de> 2005-11-05 17:25:54 +0100
committer Linus Torvalds <torvalds@g5.osdl.org> 2005-11-14 19:55:17 -0800
commit a5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree 11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch/x86_64/lib/memcpy.S
parent a6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)
[PATCH] x86_64: Remove optimization for B stepping AMD K8
B-stepping parts were the first shipping Opterons. memcpy/memset/copy_page/clear_page had special optimized versions for them. These CPUs are really old and in the minority now, and the difference from the generic versions (using rep microcode) is not that big anyway, so just remove them.

TODO: figure out optimized versions for Intel Netburst-based EM64T.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib/memcpy.S')
-rw-r--r-- arch/x86_64/lib/memcpy.S 93
1 files changed, 2 insertions, 91 deletions
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index c6c46494fef..92dd8054460 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,6 +11,8 @@
*
* Output:
* rax original destination
+ *
+ * TODO: check best memcpy for PSC
*/
.globl __memcpy
@@ -18,95 +20,6 @@
.p2align 4
__memcpy:
memcpy:
- pushq %rbx
- movq %rdi,%rax
-
- movl %edx,%ecx
- shrl $6,%ecx
- jz .Lhandle_tail
-
- .p2align 4
-.Lloop_64:
- decl %ecx
-
- movq (%rsi),%r11
- movq 8(%rsi),%r8
-
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
-
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
-
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
-
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
-
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- jnz .Lloop_64
-
-.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
- .p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
- incq %rdi
- incq %rsi
- decl %ecx
- jnz .Lloop_1
-
-.Lende:
- popq %rbx
- ret
-.Lfinal:
-
- /* C stepping K8 run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
- .section .altinstructions,"a"
- .align 8
- .quad memcpy
- .quad memcpy_c
- .byte X86_FEATURE_K8_C
- .byte .Lfinal-memcpy
- .byte memcpy_c_end-memcpy_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi source
- * rdx count
- */
-memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
@@ -117,5 +30,3 @@ memcpy_c:
rep
movsb
ret
-memcpy_c_end:
- .previous