diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2006-09-26 13:07:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-09-26 13:07:55 -0700 |
commit | b278240839e20fa9384ea430df463b367b90e04e (patch) | |
tree | f99f0c8cdd4cc7f177cd75440e6bd181cded7fb3 /arch/x86_64/lib | |
parent | dd77a4ee0f3981693d4229aa1d57cea9e526ff47 (diff) | |
parent | 3f75f42d7733e73aca5c78326489efd4189e0111 (diff) |
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (225 commits)
[PATCH] Don't set calgary iommu as default y
[PATCH] i386/x86-64: New Intel feature flags
[PATCH] x86: Add a cumulative thermal throttle event counter.
[PATCH] i386: Make the jiffies compares use the 64bit safe macros.
[PATCH] x86: Refactor thermal throttle processing
[PATCH] Add 64bit jiffies compares (for use with get_jiffies_64)
[PATCH] Fix unwinder warning in traps.c
[PATCH] x86: Allow disabling early pci scans with pci=noearly or disallowing conf1
[PATCH] x86: Move direct PCI scanning functions out of line
[PATCH] i386/x86-64: Make all early PCI scans dependent on CONFIG_PCI
[PATCH] Don't leak NT bit into next task
[PATCH] i386/x86-64: Work around gcc bug with noreturn functions in unwinder
[PATCH] Fix some broken white space in ia32_signal.c
[PATCH] Initialize argument registers for 32bit signal handlers.
[PATCH] Remove all traces of signal number conversion
[PATCH] Don't synchronize time reading on single core AMD systems
[PATCH] Remove outdated comment in x86-64 mmconfig code
[PATCH] Use string instructions for Core2 copy/clear
[PATCH] x86: - restore i8259A eoi status on resume
[PATCH] i386: Split multi-line printk in oops output.
...
Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r-- | arch/x86_64/lib/Makefile | 2 | ||||
-rw-r--r-- | arch/x86_64/lib/clear_page.S | 47 | ||||
-rw-r--r-- | arch/x86_64/lib/copy_page.S | 53 | ||||
-rw-r--r-- | arch/x86_64/lib/copy_user.S | 153 | ||||
-rw-r--r-- | arch/x86_64/lib/csum-copy.S | 26 | ||||
-rw-r--r-- | arch/x86_64/lib/getuser.S | 32 | ||||
-rw-r--r-- | arch/x86_64/lib/iomap_copy.S | 10 | ||||
-rw-r--r-- | arch/x86_64/lib/memcpy.S | 69 | ||||
-rw-r--r-- | arch/x86_64/lib/memset.S | 79 | ||||
-rw-r--r-- | arch/x86_64/lib/putuser.S | 32 | ||||
-rw-r--r-- | arch/x86_64/lib/rwlock.S | 38 | ||||
-rw-r--r-- | arch/x86_64/lib/thunk.S | 43 |
12 files changed, 363 insertions, 221 deletions
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index ccef6ae747a..b78d4170fce 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 1f81b79b796..9a10a78bb4a 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -1,10 +1,22 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * Zero a page. * rdi page */ - .globl clear_page - .p2align 4 -clear_page: + ALIGN +clear_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + xorl %eax,%eax + rep stosq + ret + CFI_ENDPROC +ENDPROC(clear_page) + +ENTRY(clear_page) + CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -23,28 +35,25 @@ clear_page: jnz .Lloop nop ret -clear_page_end: + CFI_ENDPROC +.Lclear_page_end: +ENDPROC(clear_page) /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c - .byte X86_FEATURE_REP_GOOD - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c - .previous - - .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx - xorl %eax,%eax - rep - stosq - ret -clear_page_c_end: + .quad clear_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lclear_page_end - clear_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 8fa19d96a7e..0ebb03b60e7 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -1,17 +1,33 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> + + ALIGN +copy_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + rep movsq + ret + CFI_ENDPROC +ENDPROC(copy_page_c) + /* Don't use streaming store because it's better when the target ends up in cache. */ /* Could vary the prefetch distance based on SMP/UP */ - .globl copy_page - .p2align 4 -copy_page: +ENTRY(copy_page) + CFI_STARTPROC subq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET 3*8 movq %rbx,(%rsp) + CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) + CFI_REL_OFFSET r12, 1*8 movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -72,30 +88,33 @@ copy_page: jnz .Loop2 movq (%rsp),%rbx + CFI_RESTORE rbx movq 1*8(%rsp),%r12 + CFI_RESTORE r12 movq 2*8(%rsp),%r13 + CFI_RESTORE r13 addq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET -3*8 ret +.Lcopy_page_end: + CFI_ENDPROC +ENDPROC(copy_page) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad copy_page - .quad copy_page_c - .byte X86_FEATURE_REP_GOOD - .byte copy_page_c_end-copy_page_c - .byte copy_page_c_end-copy_page_c - .previous - - .section .altinstr_replacement,"ax" -copy_page_c: - movl $4096/8,%ecx - rep - movsq - ret -copy_page_c_end: + .quad copy_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lcopy_page_end - copy_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index f64569b83b5..70bebd31040 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -4,56 +4,78 @@ * Functions to copy from and to user space. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + #define FIX_ALIGNMENT 1 - #include <asm/current.h> - #include <asm/asm-offsets.h> - #include <asm/thread_info.h> - #include <asm/cpufeature.h> +#include <asm/current.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/cpufeature.h> -/* Standard copy_to_user with segment limit checking */ - .globl copy_to_user - .p2align 4 -copy_to_user: - GET_THREAD_INFO(%rax) - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_to_user -2: + .macro ALTERNATIVE_JUMP feature,orig,alt +0: .byte 0xe9 /* 32bit jump */ - .long .Lcug-1f + .long \orig-1f /* by default jump to orig */ 1: - .section .altinstr_replacement,"ax" -3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ - .long copy_user_generic_c-1b /* offset */ +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt-1b /* offset */ /* or alternatively to alt */ .previous .section .altinstructions,"a" .align 8 + .quad 0b .quad 2b - .quad 3b - .byte X86_FEATURE_REP_GOOD + .byte \feature /* when feature is set */ .byte 5 .byte 5 .previous + .endm + +/* Standard copy_to_user with segment limit checking */ +ENTRY(copy_to_user) + CFI_STARTPROC + GET_THREAD_INFO(%rax) + movq %rdi,%rcx + addq %rdx,%rcx + jc bad_to_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_to_user + xorl %eax,%eax /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(copy_user_generic) + CFI_STARTPROC + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(__copy_from_user_inatomic) + CFI_STARTPROC + xorl %ecx,%ecx /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC /* Standard copy_from_user with segment limit checking */ - .globl copy_from_user - .p2align 4 -copy_from_user: +ENTRY(copy_from_user) + CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx jc bad_from_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_from_user - /* FALL THROUGH to copy_user_generic */ + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC +ENDPROC(copy_from_user) .section .fixup,"ax" /* must zero dest */ bad_from_user: + CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -61,40 +83,32 @@ bad_from_user: bad_to_user: movl %edx,%eax ret + CFI_ENDPROC +END(bad_from_user) .previous /* - * copy_user_generic - memory copy with exception handling. + * copy_user_generic_unrolled - memory copy with exception handling. + * This version is for CPUs like P4 that don't have efficient micro code for rep movsq * * Input: * rdi destination * rsi source * rdx count + * ecx zero flag -- if true zero destination on error * * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic - .p2align 4 -copy_user_generic: - .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ - .byte 0x66,0x90 -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long copy_user_generic_c-1b /* offset */ - .previous - .section .altinstructions,"a" - .align 8 - .quad copy_user_generic - .quad 2b - .byte X86_FEATURE_REP_GOOD - .byte 5 - .byte 5 - .previous -.Lcug: +ENTRY(copy_user_generic_unrolled) + CFI_STARTPROC pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 xorl %eax,%eax /*zero for the exception handler */ #ifdef FIX_ALIGNMENT @@ -168,9 +182,16 @@ copy_user_generic: decl %ecx jnz .Lloop_1 + CFI_REMEMBER_STATE .Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret + CFI_RESTORE_STATE #ifdef FIX_ALIGNMENT /* align destination */ @@ -252,6 +273,8 @@ copy_user_generic: addl %ecx,%edx /* edx: bytes to zero, rdi: dest, eax:zero */ .Lzero_rest: + cmpl $0,(%rsp) + jz .Le_zero movq %rdx,%rcx .Le_byte: xorl %eax,%eax @@ -261,6 +284,9 @@ copy_user_generic: .Le_zero: movq %rdx,%rax jmp .Lende + CFI_ENDPROC +ENDPROC(copy_user_generic) + /* Some CPUs run faster using the string copy instructions. This is also a lot simpler. Use them when possible. @@ -270,6 +296,7 @@ copy_user_generic: /* rdi destination * rsi source * rdx count + * ecx zero flag * * Output: * eax uncopied bytes or 0 if successfull. @@ -280,22 +307,48 @@ copy_user_generic: * And more would be dangerous because both Intel and AMD have * errata with rep movsq > 4GB. If someone feels the need to fix * this please consider this. - */ -copy_user_generic_c: + */ +ENTRY(copy_user_generic_string) + CFI_STARTPROC + movl %ecx,%r8d /* save zero flag */ movl %edx,%ecx shrl $3,%ecx andl $7,%edx + jz 10f 1: rep movsq movl %edx,%ecx 2: rep movsb -4: movl %ecx,%eax +9: movl %ecx,%eax ret -3: lea (%rdx,%rcx,8),%rax + + /* multiple of 8 byte */ +10: rep + movsq + xor %eax,%eax ret + /* exception handling */ +3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ + jmp 6f +5: movl %ecx,%eax /* exception on byte loop */ + /* eax: left over bytes */ +6: testl %r8d,%r8d /* zero flag set? */ + jz 7f + movl %eax,%ecx /* initialize x86 loop counter */ + push %rax + xorl %eax,%eax +8: rep + stosb /* zero the rest */ +11: pop %rax +7: ret + CFI_ENDPROC +END(copy_user_generic_c) + .section __ex_table,"a" .quad 1b,3b - .quad 2b,4b + .quad 2b,5b + .quad 8b,11b + .quad 10b,3b .previous diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S index 72fd55ee896..f0dba36578e 100644 --- a/arch/x86_64/lib/csum-copy.S +++ b/arch/x86_64/lib/csum-copy.S @@ -5,8 +5,9 @@ * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. */ - #include <linux/linkage.h> - #include <asm/errno.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/errno.h> /* * Checksum copy with exception handling. @@ -53,19 +54,24 @@ .endm - .globl csum_partial_copy_generic - .p2align 4 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC cmpl $3*64,%edx jle .Lignore .Lignore: subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 movq %r8,(%rsp) movq %r9,1*8(%rsp) @@ -208,14 +214,22 @@ csum_partial_copy_generic: addl %ebx,%eax adcl %r9d,%eax /* carry */ + CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp),%rbx + CFI_RESTORE rbx movq 3*8(%rsp),%r12 + CFI_RESTORE r12 movq 4*8(%rsp),%r14 + CFI_RESTORE r14 movq 5*8(%rsp),%r13 + CFI_RESTORE r13 movq 6*8(%rsp),%rbp + CFI_RESTORE rbp addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 ret + CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -231,3 +245,5 @@ csum_partial_copy_generic: jz .Lende movl $-EFAULT,(%rax) jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S index 3844d5e885a..5448876261f 100644 --- a/arch/x86_64/lib/getuser.S +++ b/arch/x86_64/lib/getuser.S @@ -27,25 +27,26 @@ */ #include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/page.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> .text - .p2align 4 -.globl __get_user_1 -__get_user_1: +ENTRY(__get_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_get_user 1: movzb (%rcx),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_1) - .p2align 4 -.globl __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -57,10 +58,11 @@ __get_user_2: ret 20: decq %rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_2) - .p2align 4 -.globl __get_user_4 -__get_user_4: +ENTRY(__get_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -72,10 +74,11 @@ __get_user_4: ret 30: subq $3,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_4) - .p2align 4 -.globl __get_user_8 -__get_user_8: +ENTRY(__get_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -87,11 +90,16 @@ __get_user_8: ret 40: subq $7,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_8) bad_get_user: + CFI_STARTPROC xorl %edx,%edx movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_get_user) .section __ex_table,"a" .quad 1b,bad_get_user diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S index 8bbade5fea0..05a95e713da 100644 --- a/arch/x86_64/lib/iomap_copy.S +++ b/arch/x86_64/lib/iomap_copy.S @@ -15,12 +15,16 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * override generic version in lib/iomap_copy.c */ - .globl __iowrite32_copy - .p2align 4 -__iowrite32_copy: +ENTRY(__iowrite32_copy) + CFI_STARTPROC movl %edx,%ecx rep movsd ret + CFI_ENDPROC +ENDPROC(__iowrite32_copy) diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 5554948b555..967b22fa7d0 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -1,6 +1,10 @@ /* Copyright 2002 Andi Kleen */ - #include <asm/cpufeature.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> + /* * memcpy - Copy a memory block. * @@ -13,12 +17,26 @@ * rax original destination */ - .globl __memcpy - .globl memcpy - .p2align 4 -__memcpy: -memcpy: + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 movq %rdi,%rax movl %edx,%ecx @@ -86,36 +104,27 @@ memcpy: .Lende: popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret .Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memcpy - .quad memcpy_c - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal-memcpy - .byte memcpy_c_end-memcpy_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi source - * rdx count - */ -memcpy_c: - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep - movsq - movl %edx,%ecx - rep - movsb - ret -memcpy_c_end: + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memcpy + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index ad397f2c7de..09ed1f6b0ea 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -1,4 +1,9 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * ISO C memset - set a memory block to a byte value. * @@ -8,11 +13,29 @@ * * rax original destination */ - .globl __memset - .globl memset - .p2align 4 -memset: -__memset: + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC movq %rdi,%r10 movq %rdx,%r11 @@ -25,6 +48,7 @@ __memset: movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment + CFI_REMEMBER_STATE .Lafter_bad_alignment: movl %r11d,%ecx @@ -75,6 +99,7 @@ __memset: movq %r10,%rax ret + CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%r11 jbe .Lhandle_7 @@ -84,42 +109,26 @@ __memset: addq %r8,%rdi subq %r8,%r11 jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memset - .quad memset_c - .byte X86_FEATURE_REP_GOOD - .byte memset_c_end-memset_c - .byte memset_c_end-memset_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi value - * rdx count - */ -memset_c: - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep - stosq - movl %r8d,%ecx - rep - stosb - movq %r9,%rax - ret -memset_c_end: + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S index 7f5593974e2..4989f5a8fa9 100644 --- a/arch/x86_64/lib/putuser.S +++ b/arch/x86_64/lib/putuser.S @@ -25,25 +25,26 @@ */ #include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/page.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> .text - .p2align 4 -.globl __put_user_1 -__put_user_1: +ENTRY(__put_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_put_user 1: movb %dl,(%rcx) xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__put_user_1) - .p2align 4 -.globl __put_user_2 -__put_user_2: +ENTRY(__put_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -55,10 +56,11 @@ __put_user_2: ret 20: decq %rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_2) - .p2align 4 -.globl __put_user_4 -__put_user_4: +ENTRY(__put_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -70,10 +72,11 @@ __put_user_4: ret 30: subq $3,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_4) - .p2align 4 -.globl __put_user_8 -__put_user_8: +ENTRY(__put_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -85,10 +88,15 @@ __put_user_8: ret 40: subq $7,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_8) bad_put_user: + CFI_STARTPROC movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_put_user) .section __ex_table,"a" .quad 1b,bad_put_user diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S new file mode 100644 index 00000000000..0cde1f80731 --- /dev/null +++ b/arch/x86_64/lib/rwlock.S @@ -0,0 +1,38 @@ +/* Slow paths of read/write spinlocks. */ + +#include <linux/linkage.h> +#include <asm/rwlock.h> +#include <asm/alternative-asm.i> +#include <asm/dwarf2.h> + +/* rdi: pointer to rwlock_t */ +ENTRY(__write_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + addl $RW_LOCK_BIAS,(%rdi) +1: rep + nop + cmpl $RW_LOCK_BIAS,(%rdi) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS,(%rdi) + jnz __write_lock_failed + ret + CFI_ENDPROC +END(__write_lock_failed) + +/* rdi: pointer to rwlock_t */ +ENTRY(__read_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + incl (%rdi) +1: rep + nop + cmpl $1,(%rdi) + js 1b + LOCK_PREFIX + decl (%rdi) + js __read_lock_failed + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 332ea5dff91..0025535cac8 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -1,10 +1,9 @@ - /* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Subject to the GNU public license, v.2. No warranty of any kind. - * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ - */ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ #include <linux/config.h> #include <linux/linkage.h> @@ -67,33 +66,3 @@ restore_norax: RESTORE_ARGS 1 ret CFI_ENDPROC - -#ifdef CONFIG_SMP -/* Support for read/write spinlocks. */ - .text -/* rax: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - lock - addl $RW_LOCK_BIAS,(%rax) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rax) - jne 1b - lock - subl $RW_LOCK_BIAS,(%rax) - jnz __write_lock_failed - ret - -/* rax: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - lock - incl (%rax) -1: rep - nop - cmpl $1,(%rax) - js 1b - lock - decl (%rax) - js __read_lock_failed - ret -#endif |