From 185f3d38900f750a4566f87cde6a178f3595a115 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:17:08 +0200 Subject: x86_64: move lib Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 2 +- arch/x86/lib/Makefile_64 | 13 ++ arch/x86/lib/bitops_64.c | 175 ++++++++++++++++++ arch/x86/lib/bitstr_64.c | 28 +++ arch/x86/lib/clear_page_64.S | 59 ++++++ arch/x86/lib/copy_page_64.S | 119 ++++++++++++ arch/x86/lib/copy_user_64.S | 354 ++++++++++++++++++++++++++++++++++++ arch/x86/lib/copy_user_nocache_64.S | 217 ++++++++++++++++++++++ arch/x86/lib/csum-copy_64.S | 249 +++++++++++++++++++++++++ arch/x86/lib/csum-partial_64.c | 150 +++++++++++++++ arch/x86/lib/csum-wrappers_64.c | 135 ++++++++++++++ arch/x86/lib/delay_64.c | 57 ++++++ arch/x86/lib/getuser_64.S | 109 +++++++++++ arch/x86/lib/io_64.c | 23 +++ arch/x86/lib/iomap_copy_64.S | 30 +++ arch/x86/lib/memcpy_64.S | 131 +++++++++++++ arch/x86/lib/memmove_64.c | 21 +++ arch/x86/lib/memset_64.S | 133 ++++++++++++++ arch/x86/lib/putuser_64.S | 106 +++++++++++ arch/x86/lib/rwlock_64.S | 38 ++++ arch/x86/lib/thunk_64.S | 67 +++++++ arch/x86/lib/usercopy_64.c | 166 +++++++++++++++++ 22 files changed, 2381 insertions(+), 1 deletion(-) create mode 100644 arch/x86/lib/Makefile_64 create mode 100644 arch/x86/lib/bitops_64.c create mode 100644 arch/x86/lib/bitstr_64.c create mode 100644 arch/x86/lib/clear_page_64.S create mode 100644 arch/x86/lib/copy_page_64.S create mode 100644 arch/x86/lib/copy_user_64.S create mode 100644 arch/x86/lib/copy_user_nocache_64.S create mode 100644 arch/x86/lib/csum-copy_64.S create mode 100644 arch/x86/lib/csum-partial_64.c create mode 100644 arch/x86/lib/csum-wrappers_64.c create mode 100644 arch/x86/lib/delay_64.c create mode 100644 arch/x86/lib/getuser_64.S create mode 100644 arch/x86/lib/io_64.c create mode 100644 arch/x86/lib/iomap_copy_64.S create mode 100644 arch/x86/lib/memcpy_64.S create mode 100644 arch/x86/lib/memmove_64.c create mode 100644 arch/x86/lib/memset_64.S create mode 100644 arch/x86/lib/putuser_64.S create mode 100644 arch/x86/lib/rwlock_64.S create mode 100644 arch/x86/lib/thunk_64.S create mode 100644 arch/x86/lib/usercopy_64.c (limited to 'arch/x86/lib') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 2d7d724a2a6..329da276c6f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -1,5 +1,5 @@ ifeq ($(CONFIG_X86_32),y) include ${srctree}/arch/x86/lib/Makefile_32 else -include ${srctree}/arch/x86_64/lib/Makefile_64 +include ${srctree}/arch/x86/lib/Makefile_64 endif diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64 new file mode 100644 index 00000000000..bbabad3c933 --- /dev/null +++ b/arch/x86/lib/Makefile_64 @@ -0,0 +1,13 @@ +# +# Makefile for x86_64-specific library files. +# + +CFLAGS_csum-partial_64.o := -funroll-loops + +obj-y := io_64.o iomap_copy_64.o +obj-$(CONFIG_SMP) += msr-on-cpu.o + +lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ + usercopy_64.o getuser_64.o putuser_64.o \ + thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o +lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c new file mode 100644 index 00000000000..95b6d9639fb --- /dev/null +++ b/arch/x86/lib/bitops_64.c @@ -0,0 +1,175 @@ +#include + +#undef find_first_zero_bit +#undef find_next_zero_bit +#undef find_first_bit +#undef find_next_bit + +static inline long +__find_first_zero_bit(const unsigned long * addr, unsigned long size) +{ + long d0, d1, d2; + long res; + + /* + * We must test the size in words, not in bits, because + * otherwise incoming sizes in the range -63..-1 will not run + * any scasq instructions, and then the flags used by the je + * instruction will have whatever random value was in place + * before. Nobody should call us like that, but + * find_next_zero_bit() does when offset and size are at the + * same word and it fails to find a zero itself. + */ + size += 63; + size >>= 6; + if (!size) + return 0; + asm volatile( + " repe; scasq\n" + " je 1f\n" + " xorq -8(%%rdi),%%rax\n" + " subq $8,%%rdi\n" + " bsfq %%rax,%%rdx\n" + "1: subq %[addr],%%rdi\n" + " shlq $3,%%rdi\n" + " addq %%rdi,%%rdx" + :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) + :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), + [addr] "S" (addr) : "memory"); + /* + * Any register would do for [addr] above, but GCC tends to + * prefer rbx over rsi, even though rsi is readily available + * and doesn't have to be saved. + */ + return res; +} + +/** + * find_first_zero_bit - find the first zero bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first zero bit, not the number of the byte + * containing a bit. + */ +long find_first_zero_bit(const unsigned long * addr, unsigned long size) +{ + return __find_first_zero_bit (addr, size); +} + +/** + * find_next_zero_bit - find the first zero bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +long find_next_zero_bit (const unsigned long * addr, long size, long offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0; + unsigned long res, bit = offset&63; + + if (bit) { + /* + * Look for zero in first word + */ + asm("bsfq %1,%0\n\t" + "cmoveq %2,%0" + : "=r" (set) + : "r" (~(*p >> bit)), "r"(64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No zero yet, search remaining full words for a zero + */ + res = __find_first_zero_bit (p, size - 64 * (p - addr)); + + return (offset + set + res); +} + +static inline long +__find_first_bit(const unsigned long * addr, unsigned long size) +{ + long d0, d1; + long res; + + /* + * We must test the size in words, not in bits, because + * otherwise incoming sizes in the range -63..-1 will not run + * any scasq instructions, and then the flags used by the jz + * instruction will have whatever random value was in place + * before. Nobody should call us like that, but + * find_next_bit() does when offset and size are at the same + * word and it fails to find a one itself. + */ + size += 63; + size >>= 6; + if (!size) + return 0; + asm volatile( + " repe; scasq\n" + " jz 1f\n" + " subq $8,%%rdi\n" + " bsfq (%%rdi),%%rax\n" + "1: subq %[addr],%%rdi\n" + " shlq $3,%%rdi\n" + " addq %%rdi,%%rax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"0" (0ULL), "1" (size), "2" (addr), + [addr] "r" (addr) : "memory"); + return res; +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +long find_first_bit(const unsigned long * addr, unsigned long size) +{ + return __find_first_bit(addr,size); +} + +/** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +long find_next_bit(const unsigned long * addr, long size, long offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0, bit = offset & 63, res; + + if (bit) { + /* + * Look for nonzero in the first 64 bits: + */ + asm("bsfq %1,%0\n\t" + "cmoveq %2,%0\n\t" + : "=r" (set) + : "r" (*p >> bit), "r" (64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = __find_first_bit (p, size - 64 * (p - addr)); + return (offset + set + res); +} + +#include + +EXPORT_SYMBOL(find_next_bit); +EXPORT_SYMBOL(find_first_bit); +EXPORT_SYMBOL(find_first_zero_bit); +EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c new file mode 100644 index 00000000000..24676609a6a --- /dev/null +++ b/arch/x86/lib/bitstr_64.c @@ -0,0 +1,28 @@ +#include +#include + +/* Find string of zero bits in a bitmap */ +unsigned long +find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) +{ + unsigned long n, end, i; + + again: + n = find_next_zero_bit(bitmap, nbits, start); + if (n == -1) + return -1; + + /* could test bitsliced, but it's hardly worth it */ + end = n+len; + if (end >= nbits) + return -1; + for (i = n+1; i < end; i++) { + if (test_bit(i, bitmap)) { + start = i+1; + goto again; + } + } + return n; +} + +EXPORT_SYMBOL(find_next_zero_string); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S new file mode 100644 index 00000000000..9a10a78bb4a --- /dev/null +++ b/arch/x86/lib/clear_page_64.S @@ -0,0 +1,59 @@ +#include +#include + +/* + * Zero a page. + * rdi page + */ + ALIGN +clear_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + xorl %eax,%eax + rep stosq + ret + CFI_ENDPROC +ENDPROC(clear_page) + +ENTRY(clear_page) + CFI_STARTPROC + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movq %rax,x*8(%rdi) + movq %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret + CFI_ENDPROC +.Lclear_page_end: +ENDPROC(clear_page) + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad clear_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lclear_page_end - clear_page + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S new file mode 100644 index 00000000000..727a5d46d2f --- /dev/null +++ b/arch/x86/lib/copy_page_64.S @@ -0,0 +1,119 @@ +/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ + +#include +#include + + ALIGN +copy_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + rep movsq + ret + CFI_ENDPROC +ENDPROC(copy_page_c) + +/* Don't use streaming store because it's better when the target + ends up in cache. */ + +/* Could vary the prefetch distance based on SMP/UP */ + +ENTRY(copy_page) + CFI_STARTPROC + subq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET 3*8 + movq %rbx,(%rsp) + CFI_REL_OFFSET rbx, 0 + movq %r12,1*8(%rsp) + CFI_REL_OFFSET r12, 1*8 + movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 + + movl $(4096/64)-5,%ecx + .p2align 4 +.Loop64: + dec %rcx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + prefetcht0 5*64(%rsi) + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64 (%rsi), %rsi + leaq 64 (%rdi), %rdi + + jnz .Loop64 + + movl $5,%ecx + .p2align 4 +.Loop2: + decl %ecx + + movq (%rsi), %rax + movq 8 (%rsi), %rbx + movq 16 (%rsi), %rdx + movq 24 (%rsi), %r8 + movq 32 (%rsi), %r9 + movq 40 (%rsi), %r10 + movq 48 (%rsi), %r11 + movq 56 (%rsi), %r12 + + movq %rax, (%rdi) + movq %rbx, 8 (%rdi) + movq %rdx, 16 (%rdi) + movq %r8, 24 (%rdi) + movq %r9, 32 (%rdi) + movq %r10, 40 (%rdi) + movq %r11, 48 (%rdi) + movq %r12, 56 (%rdi) + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Loop2 + + movq (%rsp),%rbx + CFI_RESTORE rbx + movq 1*8(%rsp),%r12 + CFI_RESTORE r12 + movq 2*8(%rsp),%r13 + CFI_RESTORE r13 + addq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET -3*8 + ret +.Lcopy_page_end: + CFI_ENDPROC +ENDPROC(copy_page) + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad copy_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lcopy_page_end - copy_page + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S new file mode 100644 index 00000000000..70bebd31040 --- /dev/null +++ b/arch/x86/lib/copy_user_64.S @@ -0,0 +1,354 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#include +#include + +#define FIX_ALIGNMENT 1 + +#include +#include +#include +#include + + .macro ALTERNATIVE_JUMP feature,orig,alt +0: + .byte 0xe9 /* 32bit jump */ + .long \orig-1f /* by default jump to orig */ +1: + .section .altinstr_replacement,"ax" +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt-1b /* offset */ /* or alternatively to alt */ + .previous + .section .altinstructions,"a" + .align 8 + .quad 0b + .quad 2b + .byte \feature /* when feature is set */ + .byte 5 + .byte 5 + .previous + .endm + +/* Standard copy_to_user with segment limit checking */ +ENTRY(copy_to_user) + CFI_STARTPROC + GET_THREAD_INFO(%rax) + movq %rdi,%rcx + addq %rdx,%rcx + jc bad_to_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_to_user + xorl %eax,%eax /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(copy_user_generic) + CFI_STARTPROC + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(__copy_from_user_inatomic) + CFI_STARTPROC + xorl %ecx,%ecx /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +/* Standard copy_from_user with segment limit checking */ +ENTRY(copy_from_user) + CFI_STARTPROC + GET_THREAD_INFO(%rax) + movq %rsi,%rcx + addq %rdx,%rcx + jc bad_from_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_from_user + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC +ENDPROC(copy_from_user) + + .section .fixup,"ax" + /* must zero dest */ +bad_from_user: + CFI_STARTPROC + movl %edx,%ecx + xorl %eax,%eax + rep + stosb +bad_to_user: + movl %edx,%eax + ret + CFI_ENDPROC +END(bad_from_user) + .previous + + +/* + * copy_user_generic_unrolled - memory copy with exception handling. + * This version is for CPUs like P4 that don't have efficient micro code for rep movsq + * + * Input: + * rdi destination + * rsi source + * rdx count + * ecx zero flag -- if true zero destination on error + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_generic_unrolled) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + xorl %eax,%eax /*zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movq %r11,(%rdi) +.Ld2: movq %r8,1*8(%rdi) +.Ld3: movq %r9,2*8(%rdi) +.Ld4: movq %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movq %r11,4*8(%rdi) +.Ld6: movq %r8,5*8(%rdi) +.Ld7: movq %r9,6*8(%rdi) +.Ld8: movq %r10,7*8(%rdi) + + decq %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movq %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(copy_user_generic) + + + /* Some CPUs run faster using the string copy instructions. + This is also a lot simpler. Use them when possible. + Patch in jmps to this code instead of copying it fully + to avoid unwanted aliasing in the exception tables. */ + + /* rdi destination + * rsi source + * rdx count + * ecx zero flag + * + * Output: + * eax uncopied bytes or 0 if successfull. + * + * Only 4GB of copy is supported. This shouldn't be a problem + * because the kernel normally only writes from/to page sized chunks + * even if user space passed a longer buffer. + * And more would be dangerous because both Intel and AMD have + * errata with rep movsq > 4GB. If someone feels the need to fix + * this please consider this. + */ +ENTRY(copy_user_generic_string) + CFI_STARTPROC + movl %ecx,%r8d /* save zero flag */ + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + jz 10f +1: rep + movsq + movl %edx,%ecx +2: rep + movsb +9: movl %ecx,%eax + ret + + /* multiple of 8 byte */ +10: rep + movsq + xor %eax,%eax + ret + + /* exception handling */ +3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ + jmp 6f +5: movl %ecx,%eax /* exception on byte loop */ + /* eax: left over bytes */ +6: testl %r8d,%r8d /* zero flag set? */ + jz 7f + movl %eax,%ecx /* initialize x86 loop counter */ + push %rax + xorl %eax,%eax +8: rep + stosb /* zero the rest */ +11: pop %rax +7: ret + CFI_ENDPROC +END(copy_user_generic_c) + + .section __ex_table,"a" + .quad 1b,3b + .quad 2b,5b + .quad 8b,11b + .quad 10b,3b + .previous diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S new file mode 100644 index 00000000000..4620efb12f1 --- /dev/null +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -0,0 +1,217 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#include +#include + +#define FIX_ALIGNMENT 1 + +#include +#include +#include +#include + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + * + * Input: + * rdi destination + * rsi source + * rdx count + * rcx zero flag when 1 zero on exception + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx /* save zero flag */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + + xorl %eax,%eax /* zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movnti %r11,(%rdi) +.Ld2: movnti %r8,1*8(%rdi) +.Ld3: movnti %r9,2*8(%rdi) +.Ld4: movnti %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movnti %r11,4*8(%rdi) +.Ld6: movnti %r8,5*8(%rdi) +.Ld7: movnti %r9,6*8(%rdi) +.Ld8: movnti %r10,7*8(%rdi) + + dec %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movnti %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE %rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) /* zero flag set? */ + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(__copy_user_nocache) + + diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S new file mode 100644 index 00000000000..f0dba36578e --- /dev/null +++ b/arch/x86/lib/csum-copy_64.S @@ -0,0 +1,249 @@ +/* + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. No warranty for anything given at all. + */ +#include +#include +#include + +/* + * Checksum copy with exception handling. + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * destination is zeroed. + * + * Input + * rdi source + * rsi destination + * edx len (32bit) + * ecx sum (32bit) + * r8 src_err_ptr (int) + * r9 dst_err_ptr (int) + * + * Output + * eax 64bit sum. undefined in case of exception. + * + * Wrappers need to take care of valid exception sum and zeroing. + * They also should align source or destination to 8 bytes. + */ + + .macro source +10: + .section __ex_table,"a" + .align 8 + .quad 10b,.Lbad_source + .previous + .endm + + .macro dest +20: + .section __ex_table,"a" + .align 8 + .quad 20b,.Lbad_dest + .previous + .endm + + .macro ignore L=.Lignore +30: + .section __ex_table,"a" + .align 8 + .quad 30b,\L + .previous + .endm + + +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC + cmpl $3*64,%edx + jle .Lignore + +.Lignore: + subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 + movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 + movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 + movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 + movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 + + movq %r8,(%rsp) + movq %r9,1*8(%rsp) + + movl %ecx,%eax + movl %edx,%ecx + + xorl %r9d,%r9d + movq %rcx,%r12 + + shrq $6,%r12 + jz .Lhandle_tail /* < 64 */ + + clc + + /* main loop. clear in 64 byte blocks */ + /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ + /* r11: temp3, rdx: temp4, r12 loopcnt */ + /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */ + .p2align 4 +.Lloop: + source + movq (%rdi),%rbx + source + movq 8(%rdi),%r8 + source + movq 16(%rdi),%r11 + source + movq 24(%rdi),%rdx + + source + movq 32(%rdi),%r10 + source + movq 40(%rdi),%rbp + source + movq 48(%rdi),%r14 + source + movq 56(%rdi),%r13 + + ignore 2f + prefetcht0 5*64(%rdi) +2: + adcq %rbx,%rax + adcq %r8,%rax + adcq %r11,%rax + adcq %rdx,%rax + adcq %r10,%rax + adcq %rbp,%rax + adcq %r14,%rax + adcq %r13,%rax + + decl %r12d + + dest + movq %rbx,(%rsi) + dest + movq %r8,8(%rsi) + dest + movq %r11,16(%rsi) + dest + movq %rdx,24(%rsi) + + dest + movq %r10,32(%rsi) + dest + movq %rbp,40(%rsi) + dest + movq %r14,48(%rsi) + dest + movq %r13,56(%rsi) + +3: + + leaq 64(%rdi),%rdi + leaq 64(%rsi),%rsi + + jnz .Lloop + + adcq %r9,%rax + + /* do last upto 56 bytes */ +.Lhandle_tail: + /* ecx: count */ + movl %ecx,%r10d + andl $63,%ecx + shrl $3,%ecx + jz .Lfold + clc + .p2align 4 +.Lloop_8: + source + movq (%rdi),%rbx + adcq %rbx,%rax + decl %ecx + dest + movq %rbx,(%rsi) + leaq 8(%rsi),%rsi /* preserve carry */ + leaq 8(%rdi),%rdi + jnz .Lloop_8 + adcq %r9,%rax /* add in carry */ + +.Lfold: + /* reduce checksum to 32bits */ + movl %eax,%ebx + shrq $32,%rax + addl %ebx,%eax + adcl %r9d,%eax + + /* do last upto 6 bytes */ +.Lhandle_7: + movl %r10d,%ecx + andl $7,%ecx + shrl $1,%ecx + jz .Lhandle_1 + movl $2,%edx + xorl %ebx,%ebx + clc + .p2align 4 +.Lloop_1: + source + movw (%rdi),%bx + adcl %ebx,%eax + decl %ecx + dest + movw %bx,(%rsi) + leaq 2(%rdi),%rdi + leaq 2(%rsi),%rsi + jnz .Lloop_1 + adcl %r9d,%eax /* add in carry */ + + /* handle last odd byte */ +.Lhandle_1: + testl $1,%r10d + jz .Lende + xorl %ebx,%ebx + source + movb (%rdi),%bl + dest + movb %bl,(%rsi) + addl %ebx,%eax + adcl %r9d,%eax /* carry */ + + CFI_REMEMBER_STATE +.Lende: + movq 2*8(%rsp),%rbx + CFI_RESTORE rbx + movq 3*8(%rsp),%r12 + CFI_RESTORE r12 + movq 4*8(%rsp),%r14 + CFI_RESTORE r14 + movq 5*8(%rsp),%r13 + CFI_RESTORE r13 + movq 6*8(%rsp),%rbp + CFI_RESTORE rbp + addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 + ret + CFI_RESTORE_STATE + + /* Exception handlers. Very simple, zeroing is done in the wrappers */ +.Lbad_source: + movq (%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + +.Lbad_dest: + movq 8(%rsp),%rax + testq %rax,%rax + jz .Lende + movl $-EFAULT,(%rax) + jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c new file mode 100644 index 00000000000..bc503f50690 --- /dev/null +++ b/arch/x86/lib/csum-partial_64.c @@ -0,0 +1,150 @@ +/* + * arch/x86_64/lib/csum-partial.c + * + * This file contains network checksum routines that are better done + * in an architecture-specific manner due to speed. + */ + +#include +#include +#include + +static inline unsigned short from32to16(unsigned a) +{ + unsigned short b = a >> 16; + asm("addw %w2,%w0\n\t" + "adcw $0,%w0\n" + : "=r" (b) + : "0" (b), "r" (a)); + return b; +} + +/* + * Do a 64-bit checksum on an arbitrary memory area. + * Returns a 32bit checksum. + * + * This isn't as time critical as it used to be because many NICs + * do hardware checksumming these days. + * + * Things tried and found to not make it faster: + * Manual Prefetching + * Unrolling to an 128 bytes inner loop. + * Using interleaving with more registers to break the carry chains. + */ +static unsigned do_csum(const unsigned char *buff, unsigned len) +{ + unsigned odd, count; + unsigned long result = 0; + + if (unlikely(len == 0)) + return result; + odd = 1 & (unsigned long) buff; + if (unlikely(odd)) { + result = *buff << 8; + len--; + buff++; + } + count = len >> 1; /* nr of 16-bit words.. */ + if (count) { + if (2 & (unsigned long) buff) { + result += *(unsigned short *)buff; + count--; + len -= 2; + buff += 2; + } + count >>= 1; /* nr of 32-bit words.. */ + if (count) { + unsigned long zero; + unsigned count64; + if (4 & (unsigned long) buff) { + result += *(unsigned int *) buff; + count--; + len -= 4; + buff += 4; + } + count >>= 1; /* nr of 64-bit words.. */ + + /* main loop using 64byte blocks */ + zero = 0; + count64 = count >> 3; + while (count64) { + asm("addq 0*8(%[src]),%[res]\n\t" + "adcq 1*8(%[src]),%[res]\n\t" + "adcq 2*8(%[src]),%[res]\n\t" + "adcq 3*8(%[src]),%[res]\n\t" + "adcq 4*8(%[src]),%[res]\n\t" + "adcq 5*8(%[src]),%[res]\n\t" + "adcq 6*8(%[src]),%[res]\n\t" + "adcq 7*8(%[src]),%[res]\n\t" + "adcq %[zero],%[res]" + : [res] "=r" (result) + : [src] "r" (buff), [zero] "r" (zero), + "[res]" (result)); + buff += 64; + count64--; + } + + /* last upto 7 8byte blocks */ + count %= 8; + while (count) { + asm("addq %1,%0\n\t" + "adcq %2,%0\n" + : "=r" (result) + : "m" (*(unsigned long *)buff), + "r" (zero), "0" (result)); + --count; + buff += 8; + } + result = add32_with_carry(result>>32, + result&0xffffffff); + + if (len & 4) { + result += *(unsigned int *) buff; + buff += 4; + } + } + if (len & 2) { + result += *(unsigned short *) buff; + buff += 2; + } + } + if (len & 1) + result += *buff; + result = add32_with_carry(result>>32, result & 0xffffffff); + if (unlikely(odd)) { + result = from32to16(result); + result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); + } + return result; +} + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 64-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum sum) +{ + return (__force __wsum)add32_with_carry(do_csum(buff, len), + (__force u32)sum); +} + +EXPORT_SYMBOL(csum_partial); + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ +__sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold(csum_partial(buff,len,0)); +} +EXPORT_SYMBOL(ip_compute_csum); + diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c new file mode 100644 index 00000000000..fd42a4a095f --- /dev/null +++ b/arch/x86/lib/csum-wrappers_64.c @@ -0,0 +1,135 @@ +/* Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v.2 + * + * Wrappers of assembly checksum functions for x86-64. + */ + +#include +#include + +/** + * csum_partial_copy_from_user - Copy and checksum from user space. + * @src: source address (user space) + * @dst: destination address + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * @errp: set to -EFAULT for an bad source address. + * + * Returns an 32bit unfolded checksum of the buffer. + * src and dst are best aligned to 64bits. + */ +__wsum +csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum isum, int *errp) +{ + might_sleep(); + *errp = 0; + if (likely(access_ok(VERIFY_READ,src, len))) { + /* Why 6, not 7? To handle odd addresses aligned we + would need to do considerable complications to fix the + checksum which is defined as an 16bit accumulator. The + fix alignment code is primarily for performance + compatibility with 32bit and that will handle odd + addresses slowly too. */ + if (unlikely((unsigned long)src & 6)) { + while (((unsigned long)src & 6) && len >= 2) { + __u16 val16; + *errp = __get_user(val16, (const __u16 __user *)src); + if (*errp) + return isum; + *(__u16 *)dst = val16; + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); + src += 2; + dst += 2; + len -= 2; + } + } + isum = csum_partial_copy_generic((__force const void *)src, + dst, len, isum, errp, NULL); + if (likely(*errp == 0)) + return isum; + } + *errp = -EFAULT; + memset(dst,0,len); + return isum; +} + +EXPORT_SYMBOL(csum_partial_copy_from_user); + +/** + * csum_partial_copy_to_user - Copy and checksum to user space. + * @src: source address + * @dst: destination address (user space) + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * @errp: set to -EFAULT for an bad destination address. + * + * Returns an 32bit unfolded checksum of the buffer. + * src and dst are best aligned to 64bits. + */ +__wsum +csum_partial_copy_to_user(const void *src, void __user *dst, + int len, __wsum isum, int *errp) +{ + might_sleep(); + if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { + *errp = -EFAULT; + return 0; + } + + if (unlikely((unsigned long)dst & 6)) { + while (((unsigned long)dst & 6) && len >= 2) { + __u16 val16 = *(__u16 *)src; + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); + *errp = __put_user(val16, (__u16 __user *)dst); + if (*errp) + return isum; + src += 2; + dst += 2; + len -= 2; + } + } + + *errp = 0; + return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); +} + +EXPORT_SYMBOL(csum_partial_copy_to_user); + +/** + * csum_partial_copy_nocheck - Copy and checksum. + * @src: source address + * @dst: destination address + * @len: number of bytes to be copied. + * @isum: initial sum that is added into the result (32bit unfolded) + * + * Returns an 32bit unfolded checksum of the buffer. + */ +__wsum +csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) +{ + return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); +} +EXPORT_SYMBOL(csum_partial_copy_nocheck); + +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, __wsum sum) +{ + __u64 rest, sum64; + + rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + + (__force __u64)sum; + asm(" addq (%[saddr]),%[sum]\n" + " adcq 8(%[saddr]),%[sum]\n" + " adcq (%[daddr]),%[sum]\n" + " adcq 8(%[daddr]),%[sum]\n" + " adcq $0,%[sum]\n" + : [sum] "=r" (sum64) + : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); + return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); +} + +EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c new file mode 100644 index 00000000000..2dbebd30834 --- /dev/null +++ b/arch/x86/lib/delay_64.c @@ -0,0 +1,57 @@ +/* + * Precise Delay Loops for x86-64 + * + * Copyright (C) 1993 Linus Torvalds + * Copyright (C) 1997 Martin Mares + * + * The __delay function must _NOT_ be inlined as its execution time + * depends wildly on alignment on many x86 processors. + */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP +#include +#endif + +int read_current_timer(unsigned long *timer_value) +{ + rdtscll(*timer_value); + return 0; +} + +void __delay(unsigned long loops) +{ + unsigned bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } + while((now-bclock) < loops); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S new file mode 100644 index 00000000000..5448876261f --- /dev/null +++ b/arch/x86/lib/getuser_64.S @@ -0,0 +1,109 @@ +/* + * __get_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ + +/* + * __get_user_X + * + * Inputs: %rcx contains the address. + * The register is modified, but all changes are undone + * before returning because the C code doesn't know about it. + * + * Outputs: %rax is error code (0 or -EFAULT) + * %rdx contains zero-extended value + * + * %r8 is destroyed. + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. + */ + +#include +#include +#include +#include +#include +#include + + .text +ENTRY(__get_user_1) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + cmpq threadinfo_addr_limit(%r8),%rcx + jae bad_get_user +1: movzb (%rcx),%edx + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__get_user_1) + +ENTRY(__get_user_2) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $1,%rcx + jc 20f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 20f + decq %rcx +2: movzwl (%rcx),%edx + xorl %eax,%eax + ret +20: decq %rcx + jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_2) + +ENTRY(__get_user_4) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $3,%rcx + jc 30f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 30f + subq $3,%rcx +3: movl (%rcx),%edx + xorl %eax,%eax + ret +30: subq $3,%rcx + jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_4) + +ENTRY(__get_user_8) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $7,%rcx + jc 40f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 40f + subq $7,%rcx +4: movq (%rcx),%rdx + xorl %eax,%eax + ret +40: subq $7,%rcx + jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_8) + +bad_get_user: + CFI_STARTPROC + xorl %edx,%edx + movq $(-EFAULT),%rax + ret + CFI_ENDPROC +END(bad_get_user) + +.section __ex_table,"a" + .quad 1b,bad_get_user + .quad 2b,bad_get_user + .quad 3b,bad_get_user + .quad 4b,bad_get_user +.previous diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c new file mode 100644 index 00000000000..87b4a4e1803 --- /dev/null +++ b/arch/x86/lib/io_64.c @@ -0,0 +1,23 @@ +#include +#include +#include + +void __memcpy_toio(unsigned long dst,const void*src,unsigned len) +{ + __inline_memcpy((void *) dst,src,len); +} +EXPORT_SYMBOL(__memcpy_toio); + +void __memcpy_fromio(void *dst,unsigned long src,unsigned len) +{ + __inline_memcpy(dst,(const void *) src,len); +} +EXPORT_SYMBOL(__memcpy_fromio); + +void memset_io(volatile void __iomem *a, int b, size_t c) +{ + /* XXX: memset can mangle the IO patterns quite a bit. + perhaps it would be better to use a dumb one */ + memset((void *)a,b,c); +} +EXPORT_SYMBOL(memset_io); diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S new file mode 100644 index 00000000000..05a95e713da --- /dev/null +++ b/arch/x86/lib/iomap_copy_64.S @@ -0,0 +1,30 @@ +/* + * Copyright 2006 PathScale, Inc. All Rights Reserved. + * + * This file is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include + +/* + * override generic version in lib/iomap_copy.c + */ +ENTRY(__iowrite32_copy) + CFI_STARTPROC + movl %edx,%ecx + rep movsd + ret + CFI_ENDPROC +ENDPROC(__iowrite32_copy) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S new file mode 100644 index 00000000000..c22981fa2f3 --- /dev/null +++ b/arch/x86/lib/memcpy_64.S @@ -0,0 +1,131 @@ +/* Copyright 2002 Andi Kleen */ + +#include +#include +#include + +/* + * memcpy - Copy a memory block. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * rax original destination + */ + + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + movq %rdi,%rax + + movl %edx,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + + movq (%rsi),%r11 + movq 8(%rsi),%r8 + + movq %r11,(%rdi) + movq %r8,1*8(%rdi) + + movq 2*8(%rsi),%r9 + movq 3*8(%rsi),%r10 + + movq %r9,2*8(%rdi) + movq %r10,3*8(%rdi) + + movq 4*8(%rsi),%r11 + movq 5*8(%rsi),%r8 + + movq %r11,4*8(%rdi) + movq %r8,5*8(%rdi) + + movq 6*8(%rsi),%r9 + movq 7*8(%rsi),%r10 + + movq %r9,6*8(%rdi) + movq %r10,7*8(%rdi) + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jnz .Lloop_64 + +.Lhandle_tail: + movl %edx,%ecx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + .p2align 4 +.Lloop_8: + decl %ecx + movq (%rsi),%r8 + movq %r8,(%rdi) + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + movb (%rsi),%r8b + movb %r8b,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + +.Lende: + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret +.Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) + + /* Some CPUs run faster using the string copy instructions. + It is also a lot simpler. Use this when possible */ + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + /* Replace only beginning, memcpy is used to apply alternatives, so it + * is silly to overwrite itself with nops - reboot is only outcome... */ + .byte 2b - 1b + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c new file mode 100644 index 00000000000..751ebae8ec4 --- /dev/null +++ b/arch/x86/lib/memmove_64.c @@ -0,0 +1,21 @@ +/* Normally compiler builtins are used, but sometimes the compiler calls out + of line code. Based on asm-i386/string.h. + */ +#define _STRING_C +#include +#include + +#undef memmove +void *memmove(void * dest,const void *src,size_t count) +{ + if (dest < src) { + return memcpy(dest,src,count); + } else { + char *p = (char *) dest + count; + char *s = (char *) src + count; + while (count--) + *--p = *--s; + } + return dest; +} +EXPORT_SYMBOL(memmove); diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S new file mode 100644 index 00000000000..2c5948116bd --- /dev/null +++ b/arch/x86/lib/memset_64.S @@ -0,0 +1,133 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include +#include + +/* + * ISO C memset - set a memory block to a byte value. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC + movq %rdi,%r10 + movq %rdx,%r11 + + /* expand byte value */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ + + /* align dst */ + movl %edi,%r9d + andl $7,%r9d + jnz .Lbad_alignment + CFI_REMEMBER_STATE +.Lafter_bad_alignment: + + movl %r11d,%ecx + shrl $6,%ecx + jz .Lhandle_tail + + .p2align 4 +.Lloop_64: + decl %ecx + movq %rax,(%rdi) + movq %rax,8(%rdi) + movq %rax,16(%rdi) + movq %rax,24(%rdi) + movq %rax,32(%rdi) + movq %rax,40(%rdi) + movq %rax,48(%rdi) + movq %rax,56(%rdi) + leaq 64(%rdi),%rdi + jnz .Lloop_64 + + /* Handle tail in loops. The loops should be faster than hard + to predict jump tables. */ + .p2align 4 +.Lhandle_tail: + movl %r11d,%ecx + andl $63&(~7),%ecx + jz .Lhandle_7 + shrl $3,%ecx + .p2align 4 +.Lloop_8: + decl %ecx + movq %rax,(%rdi) + leaq 8(%rdi),%rdi + jnz .Lloop_8 + +.Lhandle_7: + movl %r11d,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: + decl %ecx + movb %al,(%rdi) + leaq 1(%rdi),%rdi + jnz .Lloop_1 + +.Lende: + movq %r10,%rax + ret + + CFI_RESTORE_STATE +.Lbad_alignment: + cmpq $7,%r11 + jbe .Lhandle_7 + movq %rax,(%rdi) /* unaligned store */ + movq $8,%r8 + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 + jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) + + /* Some CPUs run faster using the string instructions. + It is also a lot simpler. Use this when possible */ + +#include + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous + .section .altinstructions,"a" + .align 8 + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b + .previous diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S new file mode 100644 index 00000000000..4989f5a8fa9 --- /dev/null +++ b/arch/x86/lib/putuser_64.S @@ -0,0 +1,106 @@ +/* + * __put_user functions. + * + * (C) Copyright 1998 Linus Torvalds + * (C) Copyright 2005 Andi Kleen + * + * These functions have a non-standard call interface + * to make them more efficient, especially as they + * return an error value in addition to the "real" + * return value. + */ + +/* + * __put_user_X + * + * Inputs: %rcx contains the address + * %rdx contains new value + * + * Outputs: %rax is error code (0 or -EFAULT) + * + * %r8 is destroyed. + * + * These functions should not modify any other registers, + * as they get called from within inline assembly. + */ + +#include +#include +#include +#include +#include +#include + + .text +ENTRY(__put_user_1) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + cmpq threadinfo_addr_limit(%r8),%rcx + jae bad_put_user +1: movb %dl,(%rcx) + xorl %eax,%eax + ret + CFI_ENDPROC +ENDPROC(__put_user_1) + +ENTRY(__put_user_2) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $1,%rcx + jc 20f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 20f + decq %rcx +2: movw %dx,(%rcx) + xorl %eax,%eax + ret +20: decq %rcx + jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_2) + +ENTRY(__put_user_4) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $3,%rcx + jc 30f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 30f + subq $3,%rcx +3: movl %edx,(%rcx) + xorl %eax,%eax + ret +30: subq $3,%rcx + jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_4) + +ENTRY(__put_user_8) + CFI_STARTPROC + GET_THREAD_INFO(%r8) + addq $7,%rcx + jc 40f + cmpq threadinfo_addr_limit(%r8),%rcx + jae 40f + subq $7,%rcx +4: movq %rdx,(%rcx) + xorl %eax,%eax + ret +40: subq $7,%rcx + jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_8) + +bad_put_user: + CFI_STARTPROC + movq $(-EFAULT),%rax + ret + CFI_ENDPROC +END(bad_put_user) + +.section __ex_table,"a" + .quad 1b,bad_put_user + .quad 2b,bad_put_user + .quad 3b,bad_put_user + .quad 4b,bad_put_user +.previous diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S new file mode 100644 index 00000000000..0cde1f80731 --- /dev/null +++ b/arch/x86/lib/rwlock_64.S @@ -0,0 +1,38 @@ +/* Slow paths of read/write spinlocks. */ + +#include +#include +#include +#include + +/* rdi: pointer to rwlock_t */ +ENTRY(__write_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + addl $RW_LOCK_BIAS,(%rdi) +1: rep + nop + cmpl $RW_LOCK_BIAS,(%rdi) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS,(%rdi) + jnz __write_lock_failed + ret + CFI_ENDPROC +END(__write_lock_failed) + +/* rdi: pointer to rwlock_t */ +ENTRY(__read_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + incl (%rdi) +1: rep + nop + cmpl $1,(%rdi) + js 1b + LOCK_PREFIX + decl (%rdi) + js __read_lock_failed + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S new file mode 100644 index 00000000000..55e586d352d --- /dev/null +++ b/arch/x86/lib/thunk_64.S @@ -0,0 +1,67 @@ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include + #include + #include + #include + + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro thunk name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + call \func + jmp restore + CFI_ENDPROC + .endm + + /* rdi: arg1 ... normal C conventions. rax is passed from C. */ + .macro thunk_retrax name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + call \func + jmp restore_norax + CFI_ENDPROC + .endm + + + .section .sched.text +#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM + thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed + thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed + thunk rwsem_wake_thunk,rwsem_wake + thunk rwsem_downgrade_thunk,rwsem_downgrade_wake +#endif + + thunk __down_failed,__down + thunk_retrax __down_failed_interruptible,__down_interruptible + thunk_retrax __down_failed_trylock,__down_trylock + thunk __up_wakeup,__up + +#ifdef CONFIG_TRACE_IRQFLAGS + thunk trace_hardirqs_on_thunk,trace_hardirqs_on + thunk trace_hardirqs_off_thunk,trace_hardirqs_off +#endif + + /* SAVE_ARGS below is used only for the .cfi directives it contains. */ + CFI_STARTPROC + SAVE_ARGS +restore: + RESTORE_ARGS + ret + CFI_ENDPROC + + CFI_STARTPROC + SAVE_ARGS +restore_norax: + RESTORE_ARGS 1 + ret + CFI_ENDPROC diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c new file mode 100644 index 00000000000..893d43f838c --- /dev/null +++ b/arch/x86/lib/usercopy_64.c @@ -0,0 +1,166 @@ +/* + * User address space access functions. + * + * Copyright 1997 Andi Kleen + * Copyright 1997 Linus Torvalds + * Copyright 2002 Andi Kleen + */ +#include +#include + +/* + * Copy a null terminated string from userspace. + */ + +#define __do_strncpy_from_user(dst,src,count,res) \ +do { \ + long __d0, __d1, __d2; \ + might_sleep(); \ + __asm__ __volatile__( \ + " testq %1,%1\n" \ + " jz 2f\n" \ + "0: lodsb\n" \ + " stosb\n" \ + " testb %%al,%%al\n" \ + " jz 1f\n" \ + " decq %1\n" \ + " jnz 0b\n" \ + "1: subq %1,%0\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movq %5,%0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + " .align 8\n" \ + " .quad 0b,3b\n" \ + ".previous" \ + : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ + "=&D" (__d2) \ + : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ + : "memory"); \ +} while (0) + +long +__strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res; + __do_strncpy_from_user(dst, src, count, res); + return res; +} +EXPORT_SYMBOL(__strncpy_from_user); + +long +strncpy_from_user(char *dst, const char __user *src, long count) +{ + long res = -EFAULT; + if (access_ok(VERIFY_READ, src, 1)) + return __strncpy_from_user(dst, src, count); + return res; +} +EXPORT_SYMBOL(strncpy_from_user); + +/* + * Zero Userspace + */ + +unsigned long __clear_user(void __user *addr, unsigned long size) +{ + long __d0; + might_sleep(); + /* no memory constraint because it doesn't change any memory gcc knows + about */ + asm volatile( + " testq %[size8],%[size8]\n" + " jz 4f\n" + "0: movq %[zero],(%[dst])\n" + " addq %[eight],%[dst]\n" + " decl %%ecx ; jnz 0b\n" + "4: movq %[size1],%%rcx\n" + " testl %%ecx,%%ecx\n" + " jz 2f\n" + "1: movb %b[zero],(%[dst])\n" + " incq %[dst]\n" + " decl %%ecx ; jnz 1b\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: lea 0(%[size1],%[size8],8),%[size8]\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 0b,3b\n" + " .quad 1b,2b\n" + ".previous" + : [size8] "=c"(size), [dst] "=&D" (__d0) + : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), + [zero] "r" (0UL), [eight] "r" (8UL)); + return size; +} +EXPORT_SYMBOL(__clear_user); + +unsigned long clear_user(void __user *to, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + return __clear_user(to, n); + return n; +} +EXPORT_SYMBOL(clear_user); + +/* + * Return the size of a string (including the ending 0) + * + * Return 0 on exception, a value greater than N if too long + */ + +long __strnlen_user(const char __user *s, long n) +{ + long res = 0; + char c; + + while (1) { + if (res>n) + return n+1; + if (__get_user(c, s)) + return 0; + if (!c) + return res+1; + res++; + s++; + } +} +EXPORT_SYMBOL(__strnlen_user); + +long strnlen_user(const char __user *s, long n) +{ + if (!access_ok(VERIFY_READ, s, n)) + return 0; + return __strnlen_user(s, n); +} +EXPORT_SYMBOL(strnlen_user); + +long strlen_user(const char __user *s) +{ + long res = 0; + char c; + + for (;;) { + if (get_user(c, s)) + return 0; + if (!c) + return res+1; + res++; + s++; + } +} +EXPORT_SYMBOL(strlen_user); + +unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) +{ + if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) { + return copy_user_generic((__force void *)to, (__force void *)from, len); + } + return len; +} +EXPORT_SYMBOL(copy_in_user); + -- cgit v1.2.3