From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@ppc970.osdl.org>
Date: Sat, 16 Apr 2005 15:20:36 -0700
Subject: Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!
---
 arch/x86_64/lib/memset.S | 125 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 arch/x86_64/lib/memset.S

(limited to 'arch/x86_64/lib/memset.S')

diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 00000000000..4b4c4063864
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *	
+ * rdi   destination
+ * rsi   value (char) 
+ * rdx   count (bytes) 
+ * 
+ * rax   original destination
+ */	
+ 	.globl __memset
+	.globl memset
+	.p2align 4
+memset:	
+__memset:
+	movq %rdi,%r10
+	movq %rdx,%r11
+
+	/* expand byte value  */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul    %rcx		/* with rax, clobbers rdx */
+
+	/* align dst */
+	movl  %edi,%r9d		
+	andl  $7,%r9d	
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+	
+	movl %r11d,%ecx
+	shrl $6,%ecx
+	jz	 .Lhandle_tail
+
+	.p2align 4
+.Lloop_64:	
+	decl   %ecx
+	movq  %rax,(%rdi) 
+	movq  %rax,8(%rdi) 
+	movq  %rax,16(%rdi) 
+	movq  %rax,24(%rdi) 
+	movq  %rax,32(%rdi) 
+	movq  %rax,40(%rdi) 
+	movq  %rax,48(%rdi) 
+	movq  %rax,56(%rdi) 
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64
+
+	/* Handle tail in loops. The loops should be faster than hard
+	   to predict jump tables. */ 
+	.p2align 4	   
+.Lhandle_tail:
+	movl	%r11d,%ecx
+	andl    $63&(~7),%ecx
+	jz 		.Lhandle_7
+	shrl	$3,%ecx
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8
+
+.Lhandle_7:
+	movl	%r11d,%ecx
+	andl	$7,%ecx
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %ecx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1
+	
+.Lende:	
+	movq	%r10,%rax
+	ret
+
+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe	.Lhandle_7
+	movq %rax,(%rdi)	/* unaligned store */
+	movq $8,%r8			
+	subq %r9,%r8 
+	addq %r8,%rdi
+	subq %r8,%r11
+	jmp .Lafter_bad_alignment
+
+	/* C stepping K8 run faster using the string instructions.
+	   It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>	
+		
+	.section .altinstructions,"a"
+	.align 8
+	.quad  memset
+	.quad  memset_c
+	.byte  X86_FEATURE_K8_C
+	.byte  memset_c_end-memset_c
+	.byte  memset_c_end-memset_c
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */			
+memset_c:	
+	movq %rdi,%r9
+	movl %edx,%r8d
+	andl $7,%r8d		
+	movl %edx,%ecx
+	shrl $3,%ecx		
+	/* expand byte value  */
+	movzbl %sil,%esi
+	movabs $0x0101010101010101,%rax
+	mulq   %rsi		/* with rax, clobbers rdx */
+	rep
+	stosq	
+	movl %r8d,%ecx
+	rep
+	stosb
+	movq %r9,%rax
+	ret
+memset_c_end:
+	.previous
-- 
cgit v1.2.3