[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions

They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Andi Kleen <ak@suse.de> 2006-02-03 21:51:02 +0100
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-02-04 16:43:13 -0800
commit: 7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
tree: f0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/copy_page.S
parent: 6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
1 files changed, 87 insertions, 0 deletions
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 621a1976940..8fa19d96a7e 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,7 +8,94 @@
 	.globl copy_page
 	.p2align 4
 copy_page:
+	subq	$3*8,%rsp
+	movq	%rbx,(%rsp)
+	movq	%r12,1*8(%rsp)
+	movq	%r13,2*8(%rsp)
+
+	movl	$(4096/64)-5,%ecx
+	.p2align 4
+.Loop64:
+  	dec     %rcx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	prefetcht0 5*64(%rsi)
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+
+	leaq    64 (%rsi), %rsi
+	leaq    64 (%rdi), %rdi
+
+	jnz     .Loop64
+
+	movl	$5,%ecx
+	.p2align 4
+.Loop2:
+	decl   %ecx
+
+	movq        (%rsi), %rax
+	movq      8 (%rsi), %rbx
+	movq     16 (%rsi), %rdx
+	movq     24 (%rsi), %r8
+	movq     32 (%rsi), %r9
+	movq     40 (%rsi), %r10
+	movq     48 (%rsi), %r11
+	movq     56 (%rsi), %r12
+
+	movq     %rax,    (%rdi)
+	movq     %rbx,  8 (%rdi)
+	movq     %rdx, 16 (%rdi)
+	movq     %r8,  24 (%rdi)
+	movq     %r9,  32 (%rdi)
+	movq     %r10, 40 (%rdi)
+	movq     %r11, 48 (%rdi)
+	movq     %r12, 56 (%rdi)
+
+	leaq	64(%rdi),%rdi
+	leaq	64(%rsi),%rsi
+
+	jnz	.Loop2
+
+	movq	(%rsp),%rbx
+	movq	1*8(%rsp),%r12
+	movq	2*8(%rsp),%r13
+	addq	$3*8,%rsp
+	ret
+
+	/* Some CPUs run faster using the string copy instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"
+	.align 8
+	.quad  copy_page
+	.quad  copy_page_c
+	.byte  X86_FEATURE_REP_GOOD
+	.byte  copy_page_c_end-copy_page_c
+	.byte  copy_page_c_end-copy_page_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+copy_page_c:
 	movl $4096/8,%ecx
 	rep 
 	movsq 
 	ret
+copy_page_c_end:
+	.previous
author	Andi Kleen <ak@suse.de>	2006-02-03 21:51:02 +0100
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-02-04 16:43:13 -0800
commit	7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
tree	f0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib/copy_page.S
parent	6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)