From 70d64ceaa1a84d2502405422a4dfd3f87786a347 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 10 Oct 2005 21:52:43 +1000 Subject: powerpc: Rename files to have consistent _32/_64 suffixes This doesn't change any code, just renames things so we consistently have foo_32.c and foo_64.c where we have separate 32- and 64-bit versions. Signed-off-by: Paul Mackerras --- arch/powerpc/lib/Makefile | 6 +- arch/powerpc/lib/checksum.S | 225 ---------------- arch/powerpc/lib/checksum64.S | 229 ---------------- arch/powerpc/lib/checksum_32.S | 225 ++++++++++++++++ arch/powerpc/lib/checksum_64.S | 229 ++++++++++++++++ arch/powerpc/lib/copy32.S | 543 -------------------------------------- arch/powerpc/lib/copy_32.S | 543 ++++++++++++++++++++++++++++++++++++++ arch/powerpc/lib/copypage.S | 121 --------- arch/powerpc/lib/copypage_64.S | 121 +++++++++ arch/powerpc/lib/copyuser.S | 576 ----------------------------------------- arch/powerpc/lib/copyuser_64.S | 576 +++++++++++++++++++++++++++++++++++++++++ arch/powerpc/lib/mem_64.S | 119 +++++++++ arch/powerpc/lib/memcpy.S | 172 ------------ arch/powerpc/lib/memcpy_64.S | 172 ++++++++++++ arch/powerpc/lib/usercopy.c | 41 --- arch/powerpc/lib/usercopy_64.c | 41 +++ 16 files changed, 2029 insertions(+), 1910 deletions(-) delete mode 100644 arch/powerpc/lib/checksum.S delete mode 100644 arch/powerpc/lib/checksum64.S create mode 100644 arch/powerpc/lib/checksum_32.S create mode 100644 arch/powerpc/lib/checksum_64.S delete mode 100644 arch/powerpc/lib/copy32.S create mode 100644 arch/powerpc/lib/copy_32.S delete mode 100644 arch/powerpc/lib/copypage.S create mode 100644 arch/powerpc/lib/copypage_64.S delete mode 100644 arch/powerpc/lib/copyuser.S create mode 100644 arch/powerpc/lib/copyuser_64.S create mode 100644 arch/powerpc/lib/mem_64.S delete mode 100644 arch/powerpc/lib/memcpy.S create mode 100644 arch/powerpc/lib/memcpy_64.S delete mode 100644 arch/powerpc/lib/usercopy.c create mode 100644 arch/powerpc/lib/usercopy_64.c (limited to 'arch/powerpc/lib') diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 347f9798e43..a8cedb96de5 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -3,7 +3,7 @@ # obj-y := strcase.o string.o -obj-$(CONFIG_PPC32) += div64.o copy32.o checksum.o -obj-$(CONFIG_PPC64) += copypage.o copyuser.o memcpy.o usercopy.o \ - sstep.o checksum64.o +obj-$(CONFIG_PPC32) += div64.o copy_32.o checksum_32.o +obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o memcpy_64.o \ + usercopy_64.o sstep.o checksum_64.o mem_64.o obj-$(CONFIG_PPC_ISERIES) += e2a.o diff --git a/arch/powerpc/lib/checksum.S b/arch/powerpc/lib/checksum.S deleted file mode 100644 index 7874e8a8045..00000000000 --- a/arch/powerpc/lib/checksum.S +++ /dev/null @@ -1,225 +0,0 @@ -/* - * This file contains assembly-language implementations - * of IP-style 1's complement checksum routines. - * - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). - */ - -#include -#include -#include -#include - - .text - -/* - * ip_fast_csum(buf, len) -- Optimized for IP header - * len is in words and is always >= 5. - */ -_GLOBAL(ip_fast_csum) - lwz r0,0(r3) - lwzu r5,4(r3) - addic. r4,r4,-2 - addc r0,r0,r5 - mtctr r4 - blelr- -1: lwzu r4,4(r3) - adde r0,r0,r4 - bdnz 1b - addze r0,r0 /* add in final carry */ - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - -/* - * Compute checksum of TCP or UDP pseudo-header: - * csum_tcpudp_magic(saddr, daddr, len, proto, sum) - */ -_GLOBAL(csum_tcpudp_magic) - rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ - addc r0,r3,r4 /* add 4 32-bit words together */ - adde r0,r0,r5 - adde r0,r0,r7 - addze r0,r0 /* add in final carry */ - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - -/* - * computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit) - * - * csum_partial(buff, len, sum) - */ -_GLOBAL(csum_partial) - addic r0,r5,0 - subi r3,r3,4 - srwi. r6,r4,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r5,r3,2 /* Align buffer to longword boundary */ - beq+ 1f - lhz r5,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r4,r4,2 - addc r0,r0,r5 - srwi. r6,r4,2 /* # words to do */ - beq 3f -1: mtctr r6 -2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ - adde r0,r0,r5 /* be unnecessary to unroll this loop */ - bdnz 2b - andi. r4,r4,3 -3: cmpwi 0,r4,2 - blt+ 4f - lhz r5,4(r3) - addi r3,r3,2 - subi r4,r4,2 - adde r0,r0,r5 -4: cmpwi 0,r4,1 - bne+ 5f - lbz r5,4(r3) - slwi r5,r5,8 /* Upper byte of word */ - adde r0,r0,r5 -5: addze r3,r0 /* add in final carry */ - blr - -/* - * Computes the checksum of a memory block at src, length len, - * and adds in "sum" (32-bit), while copying the block to dst. - * If an access exception occurs on src or dst, it stores -EFAULT - * to *src_err or *dst_err respectively, and (for an error on - * src) zeroes the rest of dst. - * - * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) - */ -_GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: srwi. r6,r5,4 /* # groups of 4 words to do */ - beq 10f - mtctr r6 -71: lwz r6,4(r3) -72: lwz r9,8(r3) -73: lwz r10,12(r3) -74: lwzu r11,16(r3) - adde r0,r0,r6 -75: stw r6,4(r4) - adde r0,r0,r9 -76: stw r9,8(r4) - adde r0,r0,r10 -77: stw r10,12(r4) - adde r0,r0,r11 -78: stwu r11,16(r4) - bdnz 71b -10: rlwinm. r6,r5,30,30,31 /* # words left to do */ - beq 13f - mtctr r6 -82: lwzu r9,4(r3) -92: stwu r9,4(r4) - adde r0,r0,r9 - bdnz 82b -13: andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) - addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) - addi r4,r4,2 - adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ - adde r0,r0,r6 -5: addze r3,r0 /* add in final carry */ - blr - -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ - -src_error_4: - mfctr r6 /* update # bytes remaining from ctr */ - rlwimi r5,r6,4,0,27 - b 79f -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 -79: srwi. r6,r5,2 - beq 3f - mtctr r6 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b -src_error: - cmpwi 0,r7,0 - beq 1f - li r6,-EFAULT - stw r6,0(r7) -1: addze r3,r0 - blr - -dst_error: - cmpwi 0,r8,0 - beq 1f - li r6,-EFAULT - stw r6,0(r8) -1: addze r3,r0 - blr - -.section __ex_table,"a" - .long 81b,src_error_1 - .long 91b,dst_error - .long 71b,src_error_4 - .long 72b,src_error_4 - .long 73b,src_error_4 - .long 74b,src_error_4 - .long 75b,dst_error - .long 76b,dst_error - .long 77b,dst_error - .long 78b,dst_error - .long 82b,src_error_2 - .long 92b,dst_error - .long 83b,src_error_3 - .long 93b,dst_error - .long 84b,src_error_3 - .long 94b,dst_error - .long 95b,dst_error - .long 96b,dst_error - .long 97b,dst_error diff --git a/arch/powerpc/lib/checksum64.S b/arch/powerpc/lib/checksum64.S deleted file mode 100644 index ef96c6c58ef..00000000000 --- a/arch/powerpc/lib/checksum64.S +++ /dev/null @@ -1,229 +0,0 @@ -/* - * This file contains assembly-language implementations - * of IP-style 1's complement checksum routines. - * - * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). - */ - -#include -#include -#include -#include - -/* - * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header - * len is in words and is always >= 5. - * - * In practice len == 5, but this is not guaranteed. So this code does not - * attempt to use doubleword instructions. - */ -_GLOBAL(ip_fast_csum) - lwz r0,0(r3) - lwzu r5,4(r3) - addic. r4,r4,-2 - addc r0,r0,r5 - mtctr r4 - blelr- -1: lwzu r4,4(r3) - adde r0,r0,r4 - bdnz 1b - addze r0,r0 /* add in final carry */ - rldicl r4,r0,32,0 /* fold two 32-bit halves together */ - add r0,r0,r4 - srdi r0,r0,32 - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - -/* - * Compute checksum of TCP or UDP pseudo-header: - * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) - * No real gain trying to do this specially for 64 bit, but - * the 32 bit addition may spill into the upper bits of - * the doubleword so we still must fold it down from 64. - */ -_GLOBAL(csum_tcpudp_magic) - rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ - addc r0,r3,r4 /* add 4 32-bit words together */ - adde r0,r0,r5 - adde r0,r0,r7 - rldicl r4,r0,32,0 /* fold 64 bit value */ - add r0,r4,r0 - srdi r0,r0,32 - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - -/* - * Computes the checksum of a memory block at buff, length len, - * and adds in "sum" (32-bit). - * - * This code assumes at least halfword alignment, though the length - * can be any number of bytes. The sum is accumulated in r5. - * - * csum_partial(r3=buff, r4=len, r5=sum) - */ -_GLOBAL(csum_partial) - subi r3,r3,8 /* we'll offset by 8 for the loads */ - srdi. r6,r4,3 /* divide by 8 for doubleword count */ - addic r5,r5,0 /* clear carry */ - beq 3f /* if we're doing < 8 bytes */ - andi. r0,r3,2 /* aligned on a word boundary already? */ - beq+ 1f - lhz r6,8(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r4,r4,2 - addc r5,r5,r6 - srdi. r6,r4,3 /* recompute number of doublewords */ - beq 3f /* any left? */ -1: mtctr r6 -2: ldu r6,8(r3) /* main sum loop */ - adde r5,r5,r6 - bdnz 2b - andi. r4,r4,7 /* compute bytes left to sum after doublewords */ -3: cmpwi 0,r4,4 /* is at least a full word left? */ - blt 4f - lwz r6,8(r3) /* sum this word */ - addi r3,r3,4 - subi r4,r4,4 - adde r5,r5,r6 -4: cmpwi 0,r4,2 /* is at least a halfword left? */ - blt+ 5f - lhz r6,8(r3) /* sum this halfword */ - addi r3,r3,2 - subi r4,r4,2 - adde r5,r5,r6 -5: cmpwi 0,r4,1 /* is at least a byte left? */ - bne+ 6f - lbz r6,8(r3) /* sum this byte */ - slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ - adde r5,r5,r6 -6: addze r5,r5 /* add in final carry */ - rldicl r4,r5,32,0 /* fold two 32-bit halves together */ - add r3,r4,r5 - srdi r3,r3,32 - blr - -/* - * Computes the checksum of a memory block at src, length len, - * and adds in "sum" (32-bit), while copying the block to dst. - * If an access exception occurs on src or dst, it stores -EFAULT - * to *src_err or *dst_err respectively, and (for an error on - * src) zeroes the rest of dst. - * - * This code needs to be reworked to take advantage of 64 bit sum+copy. - * However, due to tokenring halfword alignment problems this will be very - * tricky. For now we'll leave it until we instrument it somehow. - * - * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) - */ -_GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: mtctr r6 -82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ -92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ - adde r0,r0,r6 - bdnz 82b - andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) - addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) - addi r4,r4,2 - adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ - adde r0,r0,r6 -5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ - rldicl r4,r3,32,0 /* fold 64 bit value */ - add r3,r4,r3 - srdi r3,r3,32 - blr - -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ - - .globl src_error_1 -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 - srwi. r6,r5,2 - beq 3f - mtctr r6 - .globl src_error_2 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error - .globl src_error_3 -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b - .globl src_error -src_error: - cmpdi 0,r7,0 - beq 1f - li r6,-EFAULT - stw r6,0(r7) -1: addze r3,r0 - blr - - .globl dst_error -dst_error: - cmpdi 0,r8,0 - beq 1f - li r6,-EFAULT - stw r6,0(r8) -1: addze r3,r0 - blr - -.section __ex_table,"a" - .align 3 - .llong 81b,src_error_1 - .llong 91b,dst_error - .llong 82b,src_error_2 - .llong 92b,dst_error - .llong 83b,src_error_3 - .llong 93b,dst_error - .llong 84b,src_error_3 - .llong 94b,dst_error - .llong 95b,dst_error - .llong 96b,dst_error - .llong 97b,dst_error diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S new file mode 100644 index 00000000000..7874e8a8045 --- /dev/null +++ b/arch/powerpc/lib/checksum_32.S @@ -0,0 +1,225 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include +#include +#include +#include + + .text + +/* + * ip_fast_csum(buf, len) -- Optimized for IP header + * len is in words and is always >= 5. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addic. r4,r4,-2 + addc r0,r0,r5 + mtctr r4 + blelr- +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(saddr, daddr, len, proto, sum) + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * csum_partial(buff, len, sum) + */ +_GLOBAL(csum_partial) + addic r0,r5,0 + subi r3,r3,4 + srwi. r6,r4,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r5,r3,2 /* Align buffer to longword boundary */ + beq+ 1f + lhz r5,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r0,r0,r5 + srwi. r6,r4,2 /* # words to do */ + beq 3f +1: mtctr r6 +2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ + adde r0,r0,r5 /* be unnecessary to unroll this loop */ + bdnz 2b + andi. r4,r4,3 +3: cmpwi 0,r4,2 + blt+ 4f + lhz r5,4(r3) + addi r3,r3,2 + subi r4,r4,2 + adde r0,r0,r5 +4: cmpwi 0,r4,1 + bne+ 5f + lbz r5,4(r3) + slwi r5,r5,8 /* Upper byte of word */ + adde r0,r0,r5 +5: addze r3,r0 /* add in final carry */ + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: srwi. r6,r5,4 /* # groups of 4 words to do */ + beq 10f + mtctr r6 +71: lwz r6,4(r3) +72: lwz r9,8(r3) +73: lwz r10,12(r3) +74: lwzu r11,16(r3) + adde r0,r0,r6 +75: stw r6,4(r4) + adde r0,r0,r9 +76: stw r9,8(r4) + adde r0,r0,r10 +77: stw r10,12(r4) + adde r0,r0,r11 +78: stwu r11,16(r4) + bdnz 71b +10: rlwinm. r6,r5,30,30,31 /* # words left to do */ + beq 13f + mtctr r6 +82: lwzu r9,4(r3) +92: stwu r9,4(r4) + adde r0,r0,r9 + bdnz 82b +13: andi. r5,r5,3 +3: cmpwi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpwi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry */ + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + +src_error_4: + mfctr r6 /* update # bytes remaining from ctr */ + rlwimi r5,r6,4,0,27 + b 79f +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 +79: srwi. r6,r5,2 + beq 3f + mtctr r6 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b +src_error: + cmpwi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + +dst_error: + cmpwi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .long 81b,src_error_1 + .long 91b,dst_error + .long 71b,src_error_4 + .long 72b,src_error_4 + .long 73b,src_error_4 + .long 74b,src_error_4 + .long 75b,dst_error + .long 76b,dst_error + .long 77b,dst_error + .long 78b,dst_error + .long 82b,src_error_2 + .long 92b,dst_error + .long 83b,src_error_3 + .long 93b,dst_error + .long 84b,src_error_3 + .long 94b,dst_error + .long 95b,dst_error + .long 96b,dst_error + .long 97b,dst_error diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S new file mode 100644 index 00000000000..ef96c6c58ef --- /dev/null +++ b/arch/powerpc/lib/checksum_64.S @@ -0,0 +1,229 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include +#include +#include +#include + +/* + * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header + * len is in words and is always >= 5. + * + * In practice len == 5, but this is not guaranteed. So this code does not + * attempt to use doubleword instructions. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addic. r4,r4,-2 + addc r0,r0,r5 + mtctr r4 + blelr- +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32-bit halves together */ + add r0,r0,r4 + srdi r0,r0,32 + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) + * No real gain trying to do this specially for 64 bit, but + * the 32 bit addition may spill into the upper bits of + * the doubleword so we still must fold it down from 64. + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + rldicl r4,r0,32,0 /* fold 64 bit value */ + add r0,r4,r0 + srdi r0,r0,32 + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). + * + * This code assumes at least halfword alignment, though the length + * can be any number of bytes. The sum is accumulated in r5. + * + * csum_partial(r3=buff, r4=len, r5=sum) + */ +_GLOBAL(csum_partial) + subi r3,r3,8 /* we'll offset by 8 for the loads */ + srdi. r6,r4,3 /* divide by 8 for doubleword count */ + addic r5,r5,0 /* clear carry */ + beq 3f /* if we're doing < 8 bytes */ + andi. r0,r3,2 /* aligned on a word boundary already? */ + beq+ 1f + lhz r6,8(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r5,r5,r6 + srdi. r6,r4,3 /* recompute number of doublewords */ + beq 3f /* any left? */ +1: mtctr r6 +2: ldu r6,8(r3) /* main sum loop */ + adde r5,r5,r6 + bdnz 2b + andi. r4,r4,7 /* compute bytes left to sum after doublewords */ +3: cmpwi 0,r4,4 /* is at least a full word left? */ + blt 4f + lwz r6,8(r3) /* sum this word */ + addi r3,r3,4 + subi r4,r4,4 + adde r5,r5,r6 +4: cmpwi 0,r4,2 /* is at least a halfword left? */ + blt+ 5f + lhz r6,8(r3) /* sum this halfword */ + addi r3,r3,2 + subi r4,r4,2 + adde r5,r5,r6 +5: cmpwi 0,r4,1 /* is at least a byte left? */ + bne+ 6f + lbz r6,8(r3) /* sum this byte */ + slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ + adde r5,r5,r6 +6: addze r5,r5 /* add in final carry */ + rldicl r4,r5,32,0 /* fold two 32-bit halves together */ + add r3,r4,r5 + srdi r3,r3,32 + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * This code needs to be reworked to take advantage of 64 bit sum+copy. + * However, due to tokenring halfword alignment problems this will be very + * tricky. For now we'll leave it until we instrument it somehow. + * + * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: mtctr r6 +82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ +92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ + adde r0,r0,r6 + bdnz 82b + andi. r5,r5,3 +3: cmpwi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpwi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ + rldicl r4,r3,32,0 /* fold 64 bit value */ + add r3,r4,r3 + srdi r3,r3,32 + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + + .globl src_error_1 +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 + srwi. r6,r5,2 + beq 3f + mtctr r6 + .globl src_error_2 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error + .globl src_error_3 +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b + .globl src_error +src_error: + cmpdi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + + .globl dst_error +dst_error: + cmpdi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .align 3 + .llong 81b,src_error_1 + .llong 91b,dst_error + .llong 82b,src_error_2 + .llong 92b,dst_error + .llong 83b,src_error_3 + .llong 93b,dst_error + .llong 84b,src_error_3 + .llong 94b,dst_error + .llong 95b,dst_error + .llong 96b,dst_error + .llong 97b,dst_error diff --git a/arch/powerpc/lib/copy32.S b/arch/powerpc/lib/copy32.S deleted file mode 100644 index 420a912198a..00000000000 --- a/arch/powerpc/lib/copy32.S +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Memory copy functions for 32-bit PowerPC. - * - * Copyright (C) 1996-2005 Paul Mackerras. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include - -#define COPY_16_BYTES \ - lwz r7,4(r4); \ - lwz r8,8(r4); \ - lwz r9,12(r4); \ - lwzu r10,16(r4); \ - stw r7,4(r6); \ - stw r8,8(r6); \ - stw r9,12(r6); \ - stwu r10,16(r6) - -#define COPY_16_BYTES_WITHEX(n) \ -8 ## n ## 0: \ - lwz r7,4(r4); \ -8 ## n ## 1: \ - lwz r8,8(r4); \ -8 ## n ## 2: \ - lwz r9,12(r4); \ -8 ## n ## 3: \ - lwzu r10,16(r4); \ -8 ## n ## 4: \ - stw r7,4(r6); \ -8 ## n ## 5: \ - stw r8,8(r6); \ -8 ## n ## 6: \ - stw r9,12(r6); \ -8 ## n ## 7: \ - stwu r10,16(r6) - -#define COPY_16_BYTES_EXCODE(n) \ -9 ## n ## 0: \ - addi r5,r5,-(16 * n); \ - b 104f; \ -9 ## n ## 1: \ - addi r5,r5,-(16 * n); \ - b 105f; \ -.section __ex_table,"a"; \ - .align 2; \ - .long 8 ## n ## 0b,9 ## n ## 0b; \ - .long 8 ## n ## 1b,9 ## n ## 0b; \ - .long 8 ## n ## 2b,9 ## n ## 0b; \ - .long 8 ## n ## 3b,9 ## n ## 0b; \ - .long 8 ## n ## 4b,9 ## n ## 1b; \ - .long 8 ## n ## 5b,9 ## n ## 1b; \ - .long 8 ## n ## 6b,9 ## n ## 1b; \ - .long 8 ## n ## 7b,9 ## n ## 1b; \ - .text - - .text - .stabs "arch/powerpc/lib/",N_SO,0,0,0f - .stabs "copy32.S",N_SO,0,0,0f -0: - -CACHELINE_BYTES = L1_CACHE_LINE_SIZE -LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE -CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1) - -/* - * Use dcbz on the complete cache lines in the destination - * to set them to zero. This requires that the destination - * area is cacheable. -- paulus - */ -_GLOBAL(cacheable_memzero) - mr r5,r4 - li r4,0 - addi r6,r3,-4 - cmplwi 0,r5,4 - blt 7f - stwu r4,4(r6) - beqlr - andi. r0,r6,3 - add r5,r0,r5 - subf r6,r0,r6 - clrlwi r7,r6,32-LG_CACHELINE_BYTES - add r8,r7,r5 - srwi r9,r8,LG_CACHELINE_BYTES - addic. r9,r9,-1 /* total number of complete cachelines */ - ble 2f - xori r0,r7,CACHELINE_MASK & ~3 - srwi. r0,r0,2 - beq 3f - mtctr r0 -4: stwu r4,4(r6) - bdnz 4b -3: mtctr r9 - li r7,4 -#if !defined(CONFIG_8xx) -10: dcbz r7,r6 -#else -10: stw r4, 4(r6) - stw r4, 8(r6) - stw r4, 12(r6) - stw r4, 16(r6) -#if CACHE_LINE_SIZE >= 32 - stw r4, 20(r6) - stw r4, 24(r6) - stw r4, 28(r6) - stw r4, 32(r6) -#endif /* CACHE_LINE_SIZE */ -#endif - addi r6,r6,CACHELINE_BYTES - bdnz 10b - clrlwi r5,r8,32-LG_CACHELINE_BYTES - addi r5,r5,4 -2: srwi r0,r5,2 - mtctr r0 - bdz 6f -1: stwu r4,4(r6) - bdnz 1b -6: andi. r5,r5,3 -7: cmpwi 0,r5,0 - beqlr - mtctr r5 - addi r6,r6,3 -8: stbu r4,1(r6) - bdnz 8b - blr - -_GLOBAL(memset) - rlwimi r4,r4,8,16,23 - rlwimi r4,r4,16,0,15 - addi r6,r3,-4 - cmplwi 0,r5,4 - blt 7f - stwu r4,4(r6) - beqlr - andi. r0,r6,3 - add r5,r0,r5 - subf r6,r0,r6 - srwi r0,r5,2 - mtctr r0 - bdz 6f -1: stwu r4,4(r6) - bdnz 1b -6: andi. r5,r5,3 -7: cmpwi 0,r5,0 - beqlr - mtctr r5 - addi r6,r6,3 -8: stbu r4,1(r6) - bdnz 8b - blr - -/* - * This version uses dcbz on the complete cache lines in the - * destination area to reduce memory traffic. This requires that - * the destination area is cacheable. - * We only use this version if the source and dest don't overlap. - * -- paulus. - */ -_GLOBAL(cacheable_memcpy) - add r7,r3,r5 /* test if the src & dst overlap */ - add r8,r4,r5 - cmplw 0,r4,r7 - cmplw 1,r3,r8 - crand 0,0,4 /* cr0.lt &= cr1.lt */ - blt memcpy /* if regions overlap */ - - addi r4,r4,-4 - addi r6,r3,-4 - neg r0,r3 - andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ - beq 58f - - cmplw 0,r5,r0 /* is this more than total to do? */ - blt 63f /* if not much to do */ - andi. r8,r0,3 /* get it word-aligned first */ - subf r5,r0,r5 - mtctr r8 - beq+ 61f -70: lbz r9,4(r4) /* do some bytes */ - stb r9,4(r6) - addi r4,r4,1 - addi r6,r6,1 - bdnz 70b -61: srwi. r0,r0,2 - mtctr r0 - beq 58f -72: lwzu r9,4(r4) /* do some words */ - stwu r9,4(r6) - bdnz 72b - -58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ - clrlwi r5,r5,32-LG_CACHELINE_BYTES - li r11,4 - mtctr r0 - beq 63f -53: -#if !defined(CONFIG_8xx) - dcbz r11,r6 -#endif - COPY_16_BYTES -#if L1_CACHE_LINE_SIZE >= 32 - COPY_16_BYTES -#if L1_CACHE_LINE_SIZE >= 64 - COPY_16_BYTES - COPY_16_BYTES -#if L1_CACHE_LINE_SIZE >= 128 - COPY_16_BYTES - COPY_16_BYTES - COPY_16_BYTES - COPY_16_BYTES -#endif -#endif -#endif - bdnz 53b - -63: srwi. r0,r5,2 - mtctr r0 - beq 64f -30: lwzu r0,4(r4) - stwu r0,4(r6) - bdnz 30b - -64: andi. r0,r5,3 - mtctr r0 - beq+ 65f -40: lbz r0,4(r4) - stb r0,4(r6) - addi r4,r4,1 - addi r6,r6,1 - bdnz 40b -65: blr - -_GLOBAL(memmove) - cmplw 0,r3,r4 - bgt backwards_memcpy - /* fall through */ - -_GLOBAL(memcpy) - srwi. r7,r5,3 - addi r6,r3,-4 - addi r4,r4,-4 - beq 2f /* if less than 8 bytes to do */ - andi. r0,r6,3 /* get dest word aligned */ - mtctr r7 - bne 5f -1: lwz r7,4(r4) - lwzu r8,8(r4) - stw r7,4(r6) - stwu r8,8(r6) - bdnz 1b - andi. r5,r5,7 -2: cmplwi 0,r5,4 - blt 3f - lwzu r0,4(r4) - addi r5,r5,-4 - stwu r0,4(r6) -3: cmpwi 0,r5,0 - beqlr - mtctr r5 - addi r4,r4,3 - addi r6,r6,3 -4: lbzu r0,1(r4) - stbu r0,1(r6) - bdnz 4b - blr -5: subfic r0,r0,4 - mtctr r0 -6: lbz r7,4(r4) - addi r4,r4,1 - stb r7,4(r6) - addi r6,r6,1 - bdnz 6b - subf r5,r0,r5 - rlwinm. r7,r5,32-3,3,31 - beq 2b - mtctr r7 - b 1b - -_GLOBAL(backwards_memcpy) - rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ - add r6,r3,r5 - add r4,r4,r5 - beq 2f - andi. r0,r6,3 - mtctr r7 - bne 5f -1: lwz r7,-4(r4) - lwzu r8,-8(r4) - stw r7,-4(r6) - stwu r8,-8(r6) - bdnz 1b - andi. r5,r5,7 -2: cmplwi 0,r5,4 - blt 3f - lwzu r0,-4(r4) - subi r5,r5,4 - stwu r0,-4(r6) -3: cmpwi 0,r5,0 - beqlr - mtctr r5 -4: lbzu r0,-1(r4) - stbu r0,-1(r6) - bdnz 4b - blr -5: mtctr r0 -6: lbzu r7,-1(r4) - stbu r7,-1(r6) - bdnz 6b - subf r5,r0,r5 - rlwinm. r7,r5,32-3,3,31 - beq 2b - mtctr r7 - b 1b - -_GLOBAL(__copy_tofrom_user) - addi r4,r4,-4 - addi r6,r3,-4 - neg r0,r3 - andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ - beq 58f - - cmplw 0,r5,r0 /* is this more than total to do? */ - blt 63f /* if not much to do */ - andi. r8,r0,3 /* get it word-aligned first */ - mtctr r8 - beq+ 61f -70: lbz r9,4(r4) /* do some bytes */ -71: stb r9,4(r6) - addi r4,r4,1 - addi r6,r6,1 - bdnz 70b -61: subf r5,r0,r5 - srwi. r0,r0,2 - mtctr r0 - beq 58f -72: lwzu r9,4(r4) /* do some words */ -73: stwu r9,4(r6) - bdnz 72b - - .section __ex_table,"a" - .align 2 - .long 70b,100f - .long 71b,101f - .long 72b,102f - .long 73b,103f - .text - -58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ - clrlwi r5,r5,32-LG_CACHELINE_BYTES - li r11,4 - beq 63f - -#ifdef CONFIG_8xx - /* Don't use prefetch on 8xx */ - mtctr r0 - li r0,0 -53: COPY_16_BYTES_WITHEX(0) - bdnz 53b - -#else /* not CONFIG_8xx */ - /* Here we decide how far ahead to prefetch the source */ - li r3,4 - cmpwi r0,1 - li r7,0 - ble 114f - li r7,1 -#if MAX_COPY_PREFETCH > 1 - /* Heuristically, for large transfers we prefetch - MAX_COPY_PREFETCH cachelines ahead. For small transfers - we prefetch 1 cacheline ahead. */ - cmpwi r0,MAX_COPY_PREFETCH - ble 112f - li r7,MAX_COPY_PREFETCH -112: mtctr r7 -111: dcbt r3,r4 - addi r3,r3,CACHELINE_BYTES - bdnz 111b -#else - dcbt r3,r4 - addi r3,r3,CACHELINE_BYTES -#endif /* MAX_COPY_PREFETCH > 1 */ - -114: subf r8,r7,r0 - mr r0,r7 - mtctr r8 - -53: dcbt r3,r4 -54: dcbz r11,r6 - .section __ex_table,"a" - .align 2 - .long 54b,105f - .text -/* the main body of the cacheline loop */ - COPY_16_BYTES_WITHEX(0) -#if L1_CACHE_LINE_SIZE >= 32 - COPY_16_BYTES_WITHEX(1) -#if L1_CACHE_LINE_SIZE >= 64 - COPY_16_BYTES_WITHEX(2) - COPY_16_BYTES_WITHEX(3) -#if L1_CACHE_LINE_SIZE >= 128 - COPY_16_BYTES_WITHEX(4) - COPY_16_BYTES_WITHEX(5) - COPY_16_BYTES_WITHEX(6) - COPY_16_BYTES_WITHEX(7) -#endif -#endif -#endif - bdnz 53b - cmpwi r0,0 - li r3,4 - li r7,0 - bne 114b -#endif /* CONFIG_8xx */ - -63: srwi. r0,r5,2 - mtctr r0 - beq 64f -30: lwzu r0,4(r4) -31: stwu r0,4(r6) - bdnz 30b - -64: andi. r0,r5,3 - mtctr r0 - beq+ 65f -40: lbz r0,4(r4) -41: stb r0,4(r6) - addi r4,r4,1 - addi r6,r6,1 - bdnz 40b -65: li r3,0 - blr - -/* read fault, initial single-byte copy */ -100: li r9,0 - b 90f -/* write fault, initial single-byte copy */ -101: li r9,1 -90: subf r5,r8,r5 - li r3,0 - b 99f -/* read fault, initial word copy */ -102: li r9,0 - b 91f -/* write fault, initial word copy */ -103: li r9,1 -91: li r3,2 - b 99f - -/* - * this stuff handles faults in the cacheline loop and branches to either - * 104f (if in read part) or 105f (if in write part), after updating r5 - */ - COPY_16_BYTES_EXCODE(0) -#if L1_CACHE_LINE_SIZE >= 32 - COPY_16_BYTES_EXCODE(1) -#if L1_CACHE_LINE_SIZE >= 64 - COPY_16_BYTES_EXCODE(2) - COPY_16_BYTES_EXCODE(3) -#if L1_CACHE_LINE_SIZE >= 128 - COPY_16_BYTES_EXCODE(4) - COPY_16_BYTES_EXCODE(5) - COPY_16_BYTES_EXCODE(6) - COPY_16_BYTES_EXCODE(7) -#endif -#endif -#endif - -/* read fault in cacheline loop */ -104: li r9,0 - b 92f -/* fault on dcbz (effectively a write fault) */ -/* or write fault in cacheline loop */ -105: li r9,1 -92: li r3,LG_CACHELINE_BYTES - mfctr r8 - add r0,r0,r8 - b 106f -/* read fault in final word loop */ -108: li r9,0 - b 93f -/* write fault in final word loop */ -109: li r9,1 -93: andi. r5,r5,3 - li r3,2 - b 99f -/* read fault in final byte loop */ -110: li r9,0 - b 94f -/* write fault in final byte loop */ -111: li r9,1 -94: li r5,0 - li r3,0 -/* - * At this stage the number of bytes not copied is - * r5 + (ctr << r3), and r9 is 0 for read or 1 for write. - */ -99: mfctr r0 -106: slw r3,r0,r3 - add. r3,r3,r5 - beq 120f /* shouldn't happen */ - cmpwi 0,r9,0 - bne 120f -/* for a read fault, first try to continue the copy one byte at a time */ - mtctr r3 -130: lbz r0,4(r4) -131: stb r0,4(r6) - addi r4,r4,1 - addi r6,r6,1 - bdnz 130b -/* then clear out the destination: r3 bytes starting at 4(r6) */ -132: mfctr r3 - srwi. r0,r3,2 - li r9,0 - mtctr r0 - beq 113f -112: stwu r9,4(r6) - bdnz 112b -113: andi. r0,r3,3 - mtctr r0 - beq 120f -114: stb r9,4(r6) - addi r6,r6,1 - bdnz 114b -120: blr - - .section __ex_table,"a" - .align 2 - .long 30b,108b - .long 31b,109b - .long 40b,110b - .long 41b,111b - .long 130b,132b - .long 131b,120b - .long 112b,120b - .long 114b,120b - .text diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S new file mode 100644 index 00000000000..420a912198a --- /dev/null +++ b/arch/powerpc/lib/copy_32.S @@ -0,0 +1,543 @@ +/* + * Memory copy functions for 32-bit PowerPC. + * + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include + +#define COPY_16_BYTES \ + lwz r7,4(r4); \ + lwz r8,8(r4); \ + lwz r9,12(r4); \ + lwzu r10,16(r4); \ + stw r7,4(r6); \ + stw r8,8(r6); \ + stw r9,12(r6); \ + stwu r10,16(r6) + +#define COPY_16_BYTES_WITHEX(n) \ +8 ## n ## 0: \ + lwz r7,4(r4); \ +8 ## n ## 1: \ + lwz r8,8(r4); \ +8 ## n ## 2: \ + lwz r9,12(r4); \ +8 ## n ## 3: \ + lwzu r10,16(r4); \ +8 ## n ## 4: \ + stw r7,4(r6); \ +8 ## n ## 5: \ + stw r8,8(r6); \ +8 ## n ## 6: \ + stw r9,12(r6); \ +8 ## n ## 7: \ + stwu r10,16(r6) + +#define COPY_16_BYTES_EXCODE(n) \ +9 ## n ## 0: \ + addi r5,r5,-(16 * n); \ + b 104f; \ +9 ## n ## 1: \ + addi r5,r5,-(16 * n); \ + b 105f; \ +.section __ex_table,"a"; \ + .align 2; \ + .long 8 ## n ## 0b,9 ## n ## 0b; \ + .long 8 ## n ## 1b,9 ## n ## 0b; \ + .long 8 ## n ## 2b,9 ## n ## 0b; \ + .long 8 ## n ## 3b,9 ## n ## 0b; \ + .long 8 ## n ## 4b,9 ## n ## 1b; \ + .long 8 ## n ## 5b,9 ## n ## 1b; \ + .long 8 ## n ## 6b,9 ## n ## 1b; \ + .long 8 ## n ## 7b,9 ## n ## 1b; \ + .text + + .text + .stabs "arch/powerpc/lib/",N_SO,0,0,0f + .stabs "copy32.S",N_SO,0,0,0f +0: + +CACHELINE_BYTES = L1_CACHE_LINE_SIZE +LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE +CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1) + +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. -- paulus + */ +_GLOBAL(cacheable_memzero) + mr r5,r4 + li r4,0 + addi r6,r3,-4 + cmplwi 0,r5,4 + blt 7f + stwu r4,4(r6) + beqlr + andi. r0,r6,3 + add r5,r0,r5 + subf r6,r0,r6 + clrlwi r7,r6,32-LG_CACHELINE_BYTES + add r8,r7,r5 + srwi r9,r8,LG_CACHELINE_BYTES + addic. r9,r9,-1 /* total number of complete cachelines */ + ble 2f + xori r0,r7,CACHELINE_MASK & ~3 + srwi. r0,r0,2 + beq 3f + mtctr r0 +4: stwu r4,4(r6) + bdnz 4b +3: mtctr r9 + li r7,4 +#if !defined(CONFIG_8xx) +10: dcbz r7,r6 +#else +10: stw r4, 4(r6) + stw r4, 8(r6) + stw r4, 12(r6) + stw r4, 16(r6) +#if CACHE_LINE_SIZE >= 32 + stw r4, 20(r6) + stw r4, 24(r6) + stw r4, 28(r6) + stw r4, 32(r6) +#endif /* CACHE_LINE_SIZE */ +#endif + addi r6,r6,CACHELINE_BYTES + bdnz 10b + clrlwi r5,r8,32-LG_CACHELINE_BYTES + addi r5,r5,4 +2: srwi r0,r5,2 + mtctr r0 + bdz 6f +1: stwu r4,4(r6) + bdnz 1b +6: andi. r5,r5,3 +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r6,3 +8: stbu r4,1(r6) + bdnz 8b + blr + +_GLOBAL(memset) + rlwimi r4,r4,8,16,23 + rlwimi r4,r4,16,0,15 + addi r6,r3,-4 + cmplwi 0,r5,4 + blt 7f + stwu r4,4(r6) + beqlr + andi. r0,r6,3 + add r5,r0,r5 + subf r6,r0,r6 + srwi r0,r5,2 + mtctr r0 + bdz 6f +1: stwu r4,4(r6) + bdnz 1b +6: andi. r5,r5,3 +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r6,3 +8: stbu r4,1(r6) + bdnz 8b + blr + +/* + * This version uses dcbz on the complete cache lines in the + * destination area to reduce memory traffic. This requires that + * the destination area is cacheable. + * We only use this version if the source and dest don't overlap. + * -- paulus. + */ +_GLOBAL(cacheable_memcpy) + add r7,r3,r5 /* test if the src & dst overlap */ + add r8,r4,r5 + cmplw 0,r4,r7 + cmplw 1,r3,r8 + crand 0,0,4 /* cr0.lt &= cr1.lt */ + blt memcpy /* if regions overlap */ + + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + subf r5,r0,r5 + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ + stb r9,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 70b +61: srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ + stwu r9,4(r6) + bdnz 72b + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + mtctr r0 + beq 63f +53: +#if !defined(CONFIG_8xx) + dcbz r11,r6 +#endif + COPY_16_BYTES +#if L1_CACHE_LINE_SIZE >= 32 + COPY_16_BYTES +#if L1_CACHE_LINE_SIZE >= 64 + COPY_16_BYTES + COPY_16_BYTES +#if L1_CACHE_LINE_SIZE >= 128 + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES + COPY_16_BYTES +#endif +#endif +#endif + bdnz 53b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) + stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f +40: lbz r0,4(r4) + stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 40b +65: blr + +_GLOBAL(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + /* fall through */ + +_GLOBAL(memcpy) + srwi. r7,r5,3 + addi r6,r3,-4 + addi r4,r4,-4 + beq 2f /* if less than 8 bytes to do */ + andi. r0,r6,3 /* get dest word aligned */ + mtctr r7 + bne 5f +1: lwz r7,4(r4) + lwzu r8,8(r4) + stw r7,4(r6) + stwu r8,8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,4(r4) + addi r5,r5,-4 + stwu r0,4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r4,r4,3 + addi r6,r6,3 +4: lbzu r0,1(r4) + stbu r0,1(r6) + bdnz 4b + blr +5: subfic r0,r0,4 + mtctr r0 +6: lbz r7,4(r4) + addi r4,r4,1 + stb r7,4(r6) + addi r6,r6,1 + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(__copy_tofrom_user) + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ +71: stb r9,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 70b +61: subf r5,r0,r5 + srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ +73: stwu r9,4(r6) + bdnz 72b + + .section __ex_table,"a" + .align 2 + .long 70b,100f + .long 71b,101f + .long 72b,102f + .long 73b,103f + .text + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + beq 63f + +#ifdef CONFIG_8xx + /* Don't use prefetch on 8xx */ + mtctr r0 + li r0,0 +53: COPY_16_BYTES_WITHEX(0) + bdnz 53b + +#else /* not CONFIG_8xx */ + /* Here we decide how far ahead to prefetch the source */ + li r3,4 + cmpwi r0,1 + li r7,0 + ble 114f + li r7,1 +#if MAX_COPY_PREFETCH > 1 + /* Heuristically, for large transfers we prefetch + MAX_COPY_PREFETCH cachelines ahead. For small transfers + we prefetch 1 cacheline ahead. */ + cmpwi r0,MAX_COPY_PREFETCH + ble 112f + li r7,MAX_COPY_PREFETCH +112: mtctr r7 +111: dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES + bdnz 111b +#else + dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES +#endif /* MAX_COPY_PREFETCH > 1 */ + +114: subf r8,r7,r0 + mr r0,r7 + mtctr r8 + +53: dcbt r3,r4 +54: dcbz r11,r6 + .section __ex_table,"a" + .align 2 + .long 54b,105f + .text +/* the main body of the cacheline loop */ + COPY_16_BYTES_WITHEX(0) +#if L1_CACHE_LINE_SIZE >= 32 + COPY_16_BYTES_WITHEX(1) +#if L1_CACHE_LINE_SIZE >= 64 + COPY_16_BYTES_WITHEX(2) + COPY_16_BYTES_WITHEX(3) +#if L1_CACHE_LINE_SIZE >= 128 + COPY_16_BYTES_WITHEX(4) + COPY_16_BYTES_WITHEX(5) + COPY_16_BYTES_WITHEX(6) + COPY_16_BYTES_WITHEX(7) +#endif +#endif +#endif + bdnz 53b + cmpwi r0,0 + li r3,4 + li r7,0 + bne 114b +#endif /* CONFIG_8xx */ + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) +31: stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f +40: lbz r0,4(r4) +41: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 40b +65: li r3,0 + blr + +/* read fault, initial single-byte copy */ +100: li r9,0 + b 90f +/* write fault, initial single-byte copy */ +101: li r9,1 +90: subf r5,r8,r5 + li r3,0 + b 99f +/* read fault, initial word copy */ +102: li r9,0 + b 91f +/* write fault, initial word copy */ +103: li r9,1 +91: li r3,2 + b 99f + +/* + * this stuff handles faults in the cacheline loop and branches to either + * 104f (if in read part) or 105f (if in write part), after updating r5 + */ + COPY_16_BYTES_EXCODE(0) +#if L1_CACHE_LINE_SIZE >= 32 + COPY_16_BYTES_EXCODE(1) +#if L1_CACHE_LINE_SIZE >= 64 + COPY_16_BYTES_EXCODE(2) + COPY_16_BYTES_EXCODE(3) +#if L1_CACHE_LINE_SIZE >= 128 + COPY_16_BYTES_EXCODE(4) + COPY_16_BYTES_EXCODE(5) + COPY_16_BYTES_EXCODE(6) + COPY_16_BYTES_EXCODE(7) +#endif +#endif +#endif + +/* read fault in cacheline loop */ +104: li r9,0 + b 92f +/* fault on dcbz (effectively a write fault) */ +/* or write fault in cacheline loop */ +105: li r9,1 +92: li r3,LG_CACHELINE_BYTES + mfctr r8 + add r0,r0,r8 + b 106f +/* read fault in final word loop */ +108: li r9,0 + b 93f +/* write fault in final word loop */ +109: li r9,1 +93: andi. r5,r5,3 + li r3,2 + b 99f +/* read fault in final byte loop */ +110: li r9,0 + b 94f +/* write fault in final byte loop */ +111: li r9,1 +94: li r5,0 + li r3,0 +/* + * At this stage the number of bytes not copied is + * r5 + (ctr << r3), and r9 is 0 for read or 1 for write. + */ +99: mfctr r0 +106: slw r3,r0,r3 + add. r3,r3,r5 + beq 120f /* shouldn't happen */ + cmpwi 0,r9,0 + bne 120f +/* for a read fault, first try to continue the copy one byte at a time */ + mtctr r3 +130: lbz r0,4(r4) +131: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 130b +/* then clear out the destination: r3 bytes starting at 4(r6) */ +132: mfctr r3 + srwi. r0,r3,2 + li r9,0 + mtctr r0 + beq 113f +112: stwu r9,4(r6) + bdnz 112b +113: andi. r0,r3,3 + mtctr r0 + beq 120f +114: stb r9,4(r6) + addi r6,r6,1 + bdnz 114b +120: blr + + .section __ex_table,"a" + .align 2 + .long 30b,108b + .long 31b,109b + .long 40b,110b + .long 41b,111b + .long 130b,132b + .long 131b,120b + .long 112b,120b + .long 114b,120b + .text diff --git a/arch/powerpc/lib/copypage.S b/arch/powerpc/lib/copypage.S deleted file mode 100644 index 733d61618bb..00000000000 --- a/arch/powerpc/lib/copypage.S +++ /dev/null @@ -1,121 +0,0 @@ -/* - * arch/ppc64/lib/copypage.S - * - * Copyright (C) 2002 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include - -_GLOBAL(copy_page) - std r31,-8(1) - std r30,-16(1) - std r29,-24(1) - std r28,-32(1) - std r27,-40(1) - std r26,-48(1) - std r25,-56(1) - std r24,-64(1) - std r23,-72(1) - std r22,-80(1) - std r21,-88(1) - std r20,-96(1) - li r5,4096/32 - 1 - addi r3,r3,-8 - li r12,5 -0: addi r5,r5,-24 - mtctr r12 - ld r22,640(4) - ld r21,512(4) - ld r20,384(4) - ld r11,256(4) - ld r9,128(4) - ld r7,0(4) - ld r25,648(4) - ld r24,520(4) - ld r23,392(4) - ld r10,264(4) - ld r8,136(4) - ldu r6,8(4) - cmpwi r5,24 -1: std r22,648(3) - std r21,520(3) - std r20,392(3) - std r11,264(3) - std r9,136(3) - std r7,8(3) - ld r28,648(4) - ld r27,520(4) - ld r26,392(4) - ld r31,264(4) - ld r30,136(4) - ld r29,8(4) - std r25,656(3) - std r24,528(3) - std r23,400(3) - std r10,272(3) - std r8,144(3) - std r6,16(3) - ld r22,656(4) - ld r21,528(4) - ld r20,400(4) - ld r11,272(4) - ld r9,144(4) - ld r7,16(4) - std r28,664(3) - std r27,536(3) - std r26,408(3) - std r31,280(3) - std r30,152(3) - stdu r29,24(3) - ld r25,664(4) - ld r24,536(4) - ld r23,408(4) - ld r10,280(4) - ld r8,152(4) - ldu r6,24(4) - bdnz 1b - std r22,648(3) - std r21,520(3) - std r20,392(3) - std r11,264(3) - std r9,136(3) - std r7,8(3) - addi r4,r4,640 - addi r3,r3,648 - bge 0b - mtctr r5 - ld r7,0(4) - ld r8,8(4) - ldu r9,16(4) -3: ld r10,8(4) - std r7,8(3) - ld r7,16(4) - std r8,16(3) - ld r8,24(4) - std r9,24(3) - ldu r9,32(4) - stdu r10,32(3) - bdnz 3b -4: ld r10,8(4) - std r7,8(3) - std r8,16(3) - std r9,24(3) - std r10,32(3) -9: ld r20,-96(1) - ld r21,-88(1) - ld r22,-80(1) - ld r23,-72(1) - ld r24,-64(1) - ld r25,-56(1) - ld r26,-48(1) - ld r27,-40(1) - ld r28,-32(1) - ld r29,-24(1) - ld r30,-16(1) - ld r31,-8(1) - blr diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S new file mode 100644 index 00000000000..733d61618bb --- /dev/null +++ b/arch/powerpc/lib/copypage_64.S @@ -0,0 +1,121 @@ +/* + * arch/ppc64/lib/copypage.S + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +_GLOBAL(copy_page) + std r31,-8(1) + std r30,-16(1) + std r29,-24(1) + std r28,-32(1) + std r27,-40(1) + std r26,-48(1) + std r25,-56(1) + std r24,-64(1) + std r23,-72(1) + std r22,-80(1) + std r21,-88(1) + std r20,-96(1) + li r5,4096/32 - 1 + addi r3,r3,-8 + li r12,5 +0: addi r5,r5,-24 + mtctr r12 + ld r22,640(4) + ld r21,512(4) + ld r20,384(4) + ld r11,256(4) + ld r9,128(4) + ld r7,0(4) + ld r25,648(4) + ld r24,520(4) + ld r23,392(4) + ld r10,264(4) + ld r8,136(4) + ldu r6,8(4) + cmpwi r5,24 +1: std r22,648(3) + std r21,520(3) + std r20,392(3) + std r11,264(3) + std r9,136(3) + std r7,8(3) + ld r28,648(4) + ld r27,520(4) + ld r26,392(4) + ld r31,264(4) + ld r30,136(4) + ld r29,8(4) + std r25,656(3) + std r24,528(3) + std r23,400(3) + std r10,272(3) + std r8,144(3) + std r6,16(3) + ld r22,656(4) + ld r21,528(4) + ld r20,400(4) + ld r11,272(4) + ld r9,144(4) + ld r7,16(4) + std r28,664(3) + std r27,536(3) + std r26,408(3) + std r31,280(3) + std r30,152(3) + stdu r29,24(3) + ld r25,664(4) + ld r24,536(4) + ld r23,408(4) + ld r10,280(4) + ld r8,152(4) + ldu r6,24(4) + bdnz 1b + std r22,648(3) + std r21,520(3) + std r20,392(3) + std r11,264(3) + std r9,136(3) + std r7,8(3) + addi r4,r4,640 + addi r3,r3,648 + bge 0b + mtctr r5 + ld r7,0(4) + ld r8,8(4) + ldu r9,16(4) +3: ld r10,8(4) + std r7,8(3) + ld r7,16(4) + std r8,16(3) + ld r8,24(4) + std r9,24(3) + ldu r9,32(4) + stdu r10,32(3) + bdnz 3b +4: ld r10,8(4) + std r7,8(3) + std r8,16(3) + std r9,24(3) + std r10,32(3) +9: ld r20,-96(1) + ld r21,-88(1) + ld r22,-80(1) + ld r23,-72(1) + ld r24,-64(1) + ld r25,-56(1) + ld r26,-48(1) + ld r27,-40(1) + ld r28,-32(1) + ld r29,-24(1) + ld r30,-16(1) + ld r31,-8(1) + blr diff --git a/arch/powerpc/lib/copyuser.S b/arch/powerpc/lib/copyuser.S deleted file mode 100644 index a0b3fbbd6fb..00000000000 --- a/arch/powerpc/lib/copyuser.S +++ /dev/null @@ -1,576 +0,0 @@ -/* - * arch/ppc64/lib/copyuser.S - * - * Copyright (C) 2002 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include - - .align 7 -_GLOBAL(__copy_tofrom_user) - /* first check for a whole page copy on a page boundary */ - cmpldi cr1,r5,16 - cmpdi cr6,r5,4096 - or r0,r3,r4 - neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ - andi. r0,r0,4095 - std r3,-24(r1) - crand cr0*4+2,cr0*4+2,cr6*4+2 - std r4,-16(r1) - std r5,-8(r1) - dcbt 0,r4 - beq .Lcopy_page - andi. r6,r6,7 - mtcrf 0x01,r5 - blt cr1,.Lshort_copy - bne .Ldst_unaligned -.Ldst_aligned: - andi. r0,r4,7 - addi r3,r3,-16 - bne .Lsrc_unaligned - srdi r7,r5,4 -20: ld r9,0(r4) - addi r4,r4,-8 - mtctr r7 - andi. r5,r5,7 - bf cr7*4+0,22f - addi r3,r3,8 - addi r4,r4,8 - mr r8,r9 - blt cr1,72f -21: ld r9,8(r4) -70: std r8,8(r3) -22: ldu r8,16(r4) -71: stdu r9,16(r3) - bdnz 21b -72: std r8,8(r3) - beq+ 3f - addi r3,r3,16 -23: ld r9,8(r4) -.Ldo_tail: - bf cr7*4+1,1f - rotldi r9,r9,32 -73: stw r9,0(r3) - addi r3,r3,4 -1: bf cr7*4+2,2f - rotldi r9,r9,16 -74: sth r9,0(r3) - addi r3,r3,2 -2: bf cr7*4+3,3f - rotldi r9,r9,8 -75: stb r9,0(r3) -3: li r3,0 - blr - -.Lsrc_unaligned: - srdi r6,r5,3 - addi r5,r5,-16 - subf r4,r0,r4 - srdi r7,r5,4 - sldi r10,r0,3 - cmpldi cr6,r6,3 - andi. r5,r5,7 - mtctr r7 - subfic r11,r10,64 - add r5,r5,r0 - bt cr7*4+0,28f - -24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */ -25: ld r0,8(r4) - sld r6,r9,r10 -26: ldu r9,16(r4) - srd r7,r0,r11 - sld r8,r0,r10 - or r7,r7,r6 - blt cr6,79f -27: ld r0,8(r4) - b 2f - -28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */ -29: ldu r9,8(r4) - sld r8,r0,r10 - addi r3,r3,-8 - blt cr6,5f -30: ld r0,8(r4) - srd r12,r9,r11 - sld r6,r9,r10 -31: ldu r9,16(r4) - or r12,r8,r12 - srd r7,r0,r11 - sld r8,r0,r10 - addi r3,r3,16 - beq cr6,78f - -1: or r7,r7,r6 -32: ld r0,8(r4) -76: std r12,8(r3) -2: srd r12,r9,r11 - sld r6,r9,r10 -33: ldu r9,16(r4) - or r12,r8,r12 -77: stdu r7,16(r3) - srd r7,r0,r11 - sld r8,r0,r10 - bdnz 1b - -78: std r12,8(r3) - or r7,r7,r6 -79: std r7,16(r3) -5: srd r12,r9,r11 - or r12,r8,r12 -80: std r12,24(r3) - bne 6f - li r3,0 - blr -6: cmpwi cr1,r5,8 - addi r3,r3,32 - sld r9,r9,r10 - ble cr1,.Ldo_tail -34: ld r0,8(r4) - srd r7,r0,r11 - or r9,r7,r9 - b .Ldo_tail - -.Ldst_unaligned: - mtcrf 0x01,r6 /* put #bytes to 8B bdry into cr7 */ - subf r5,r6,r5 - li r7,0 - cmpldi r1,r5,16 - bf cr7*4+3,1f -35: lbz r0,0(r4) -81: stb r0,0(r3) - addi r7,r7,1 -1: bf cr7*4+2,2f -36: lhzx r0,r7,r4 -82: sthx r0,r7,r3 - addi r7,r7,2 -2: bf cr7*4+1,3f -37: lwzx r0,r7,r4 -83: stwx r0,r7,r3 -3: mtcrf 0x01,r5 - add r4,r6,r4 - add r3,r6,r3 - b .Ldst_aligned - -.Lshort_copy: - bf cr7*4+0,1f -38: lwz r0,0(r4) -39: lwz r9,4(r4) - addi r4,r4,8 -84: stw r0,0(r3) -85: stw r9,4(r3) - addi r3,r3,8 -1: bf cr7*4+1,2f -40: lwz r0,0(r4) - addi r4,r4,4 -86: stw r0,0(r3) - addi r3,r3,4 -2: bf cr7*4+2,3f -41: lhz r0,0(r4) - addi r4,r4,2 -87: sth r0,0(r3) - addi r3,r3,2 -3: bf cr7*4+3,4f -42: lbz r0,0(r4) -88: stb r0,0(r3) -4: li r3,0 - blr - -/* - * exception handlers follow - * we have to return the number of bytes not copied - * for an exception on a load, we set the rest of the destination to 0 - */ - -136: -137: - add r3,r3,r7 - b 1f -130: -131: - addi r3,r3,8 -120: -122: -124: -125: -126: -127: -128: -129: -133: - addi r3,r3,8 -121: -132: - addi r3,r3,8 -123: -134: -135: -138: -139: -140: -141: -142: - -/* - * here we have had a fault on a load and r3 points to the first - * unmodified byte of the destination - */ -1: ld r6,-24(r1) - ld r4,-16(r1) - ld r5,-8(r1) - subf r6,r6,r3 - add r4,r4,r6 - subf r5,r6,r5 /* #bytes left to go */ - -/* - * first see if we can copy any more bytes before hitting another exception - */ - mtctr r5 -43: lbz r0,0(r4) - addi r4,r4,1 -89: stb r0,0(r3) - addi r3,r3,1 - bdnz 43b - li r3,0 /* huh? all copied successfully this time? */ - blr - -/* - * here we have trapped again, need to clear ctr bytes starting at r3 - */ -143: mfctr r5 - li r0,0 - mr r4,r3 - mr r3,r5 /* return the number of bytes not copied */ -1: andi. r9,r4,7 - beq 3f -90: stb r0,0(r4) - addic. r5,r5,-1 - addi r4,r4,1 - bne 1b - blr -3: cmpldi cr1,r5,8 - srdi r9,r5,3 - andi. r5,r5,7 - blt cr1,93f - mtctr r9 -91: std r0,0(r4) - addi r4,r4,8 - bdnz 91b -93: beqlr - mtctr r5 -92: stb r0,0(r4) - addi r4,r4,1 - bdnz 92b - blr - -/* - * exception handlers for stores: we just need to work - * out how many bytes weren't copied - */ -182: -183: - add r3,r3,r7 - b 1f -180: - addi r3,r3,8 -171: -177: - addi r3,r3,8 -170: -172: -176: -178: - addi r3,r3,4 -185: - addi r3,r3,4 -173: -174: -175: -179: -181: -184: -186: -187: -188: -189: -1: - ld r6,-24(r1) - ld r5,-8(r1) - add r6,r6,r5 - subf r3,r3,r6 /* #bytes not copied */ -190: -191: -192: - blr /* #bytes not copied in r3 */ - - .section __ex_table,"a" - .align 3 - .llong 20b,120b - .llong 21b,121b - .llong 70b,170b - .llong 22b,122b - .llong 71b,171b - .llong 72b,172b - .llong 23b,123b - .llong 73b,173b - .llong 74b,174b - .llong 75b,175b - .llong 24b,124b - .llong 25b,125b - .llong 26b,126b - .llong 27b,127b - .llong 28b,128b - .llong 29b,129b - .llong 30b,130b - .llong 31b,131b - .llong 32b,132b - .llong 76b,176b - .llong 33b,133b - .llong 77b,177b - .llong 78b,178b - .llong 79b,179b - .llong 80b,180b - .llong 34b,134b - .llong 35b,135b - .llong 81b,181b - .llong 36b,136b - .llong 82b,182b - .llong 37b,137b - .llong 83b,183b - .llong 38b,138b - .llong 39b,139b - .llong 84b,184b - .llong 85b,185b - .llong 40b,140b - .llong 86b,186b - .llong 41b,141b - .llong 87b,187b - .llong 42b,142b - .llong 88b,188b - .llong 43b,143b - .llong 89b,189b - .llong 90b,190b - .llong 91b,191b - .llong 92b,192b - - .text - -/* - * Routine to copy a whole page of data, optimized for POWER4. - * On POWER4 it is more than 50% faster than the simple loop - * above (following the .Ldst_aligned label) but it runs slightly - * slower on POWER3. - */ -.Lcopy_page: - std r31,-32(1) - std r30,-40(1) - std r29,-48(1) - std r28,-56(1) - std r27,-64(1) - std r26,-72(1) - std r25,-80(1) - std r24,-88(1) - std r23,-96(1) - std r22,-104(1) - std r21,-112(1) - std r20,-120(1) - li r5,4096/32 - 1 - addi r3,r3,-8 - li r0,5 -0: addi r5,r5,-24 - mtctr r0 -20: ld r22,640(4) -21: ld r21,512(4) -22: ld r20,384(4) -23: ld r11,256(4) -24: ld r9,128(4) -25: ld r7,0(4) -26: ld r25,648(4) -27: ld r24,520(4) -28: ld r23,392(4) -29: ld r10,264(4) -30: ld r8,136(4) -31: ldu r6,8(4) - cmpwi r5,24 -1: -32: std r22,648(3) -33: std r21,520(3) -34: std r20,392(3) -35: std r11,264(3) -36: std r9,136(3) -37: std r7,8(3) -38: ld r28,648(4) -39: ld r27,520(4) -40: ld r26,392(4) -41: ld r31,264(4) -42: ld r30,136(4) -43: ld r29,8(4) -44: std r25,656(3) -45: std r24,528(3) -46: std r23,400(3) -47: std r10,272(3) -48: std r8,144(3) -49: std r6,16(3) -50: ld r22,656(4) -51: ld r21,528(4) -52: ld r20,400(4) -53: ld r11,272(4) -54: ld r9,144(4) -55: ld r7,16(4) -56: std r28,664(3) -57: std r27,536(3) -58: std r26,408(3) -59: std r31,280(3) -60: std r30,152(3) -61: stdu r29,24(3) -62: ld r25,664(4) -63: ld r24,536(4) -64: ld r23,408(4) -65: ld r10,280(4) -66: ld r8,152(4) -67: ldu r6,24(4) - bdnz 1b -68: std r22,648(3) -69: std r21,520(3) -70: std r20,392(3) -71: std r11,264(3) -72: std r9,136(3) -73: std r7,8(3) -74: addi r4,r4,640 -75: addi r3,r3,648 - bge 0b - mtctr r5 -76: ld r7,0(4) -77: ld r8,8(4) -78: ldu r9,16(4) -3: -79: ld r10,8(4) -80: std r7,8(3) -81: ld r7,16(4) -82: std r8,16(3) -83: ld r8,24(4) -84: std r9,24(3) -85: ldu r9,32(4) -86: stdu r10,32(3) - bdnz 3b -4: -87: ld r10,8(4) -88: std r7,8(3) -89: std r8,16(3) -90: std r9,24(3) -91: std r10,32(3) -9: ld r20,-120(1) - ld r21,-112(1) - ld r22,-104(1) - ld r23,-96(1) - ld r24,-88(1) - ld r25,-80(1) - ld r26,-72(1) - ld r27,-64(1) - ld r28,-56(1) - ld r29,-48(1) - ld r30,-40(1) - ld r31,-32(1) - li r3,0 - blr - -/* - * on an exception, reset to the beginning and jump back into the - * standard __copy_tofrom_user - */ -100: ld r20,-120(1) - ld r21,-112(1) - ld r22,-104(1) - ld r23,-96(1) - ld r24,-88(1) - ld r25,-80(1) - ld r26,-72(1) - ld r27,-64(1) - ld r28,-56(1) - ld r29,-48(1) - ld r30,-40(1) - ld r31,-32(1) - ld r3,-24(r1) - ld r4,-16(r1) - li r5,4096 - b .Ldst_aligned - - .section __ex_table,"a" - .align 3 - .llong 20b,100b - .llong 21b,100b - .llong 22b,100b - .llong 23b,100b - .llong 24b,100b - .llong 25b,100b - .llong 26b,100b - .llong 27b,100b - .llong 28b,100b - .llong 29b,100b - .llong 30b,100b - .llong 31b,100b - .llong 32b,100b - .llong 33b,100b - .llong 34b,100b - .llong 35b,100b - .llong 36b,100b - .llong 37b,100b - .llong 38b,100b - .llong 39b,100b - .llong 40b,100b - .llong 41b,100b - .llong 42b,100b - .llong 43b,100b - .llong 44b,100b - .llong 45b,100b - .llong 46b,100b - .llong 47b,100b - .llong 48b,100b - .llong 49b,100b - .llong 50b,100b - .llong 51b,100b - .llong 52b,100b - .llong 53b,100b - .llong 54b,100b - .llong 55b,100b - .llong 56b,100b - .llong 57b,100b - .llong 58b,100b - .llong 59b,100b - .llong 60b,100b - .llong 61b,100b - .llong 62b,100b - .llong 63b,100b - .llong 64b,100b - .llong 65b,100b - .llong 66b,100b - .llong 67b,100b - .llong 68b,100b - .llong 69b,100b - .llong 70b,100b - .llong 71b,100b - .llong 72b,100b - .llong 73b,100b - .llong 74b,100b - .llong 75b,100b - .llong 76b,100b - .llong 77b,100b - .llong 78b,100b - .llong 79b,100b - .llong 80b,100b - .llong 81b,100b - .llong 82b,100b - .llong 83b,100b - .llong 84b,100b - .llong 85b,100b - .llong 86b,100b - .llong 87b,100b - .llong 88b,100b - .llong 89b,100b - .llong 90b,100b - .llong 91b,100b diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S new file mode 100644 index 00000000000..a0b3fbbd6fb --- /dev/null +++ b/arch/powerpc/lib/copyuser_64.S @@ -0,0 +1,576 @@ +/* + * arch/ppc64/lib/copyuser.S + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + + .align 7 +_GLOBAL(__copy_tofrom_user) + /* first check for a whole page copy on a page boundary */ + cmpldi cr1,r5,16 + cmpdi cr6,r5,4096 + or r0,r3,r4 + neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + andi. r0,r0,4095 + std r3,-24(r1) + crand cr0*4+2,cr0*4+2,cr6*4+2 + std r4,-16(r1) + std r5,-8(r1) + dcbt 0,r4 + beq .Lcopy_page + andi. r6,r6,7 + mtcrf 0x01,r5 + blt cr1,.Lshort_copy + bne .Ldst_unaligned +.Ldst_aligned: + andi. r0,r4,7 + addi r3,r3,-16 + bne .Lsrc_unaligned + srdi r7,r5,4 +20: ld r9,0(r4) + addi r4,r4,-8 + mtctr r7 + andi. r5,r5,7 + bf cr7*4+0,22f + addi r3,r3,8 + addi r4,r4,8 + mr r8,r9 + blt cr1,72f +21: ld r9,8(r4) +70: std r8,8(r3) +22: ldu r8,16(r4) +71: stdu r9,16(r3) + bdnz 21b +72: std r8,8(r3) + beq+ 3f + addi r3,r3,16 +23: ld r9,8(r4) +.Ldo_tail: + bf cr7*4+1,1f + rotldi r9,r9,32 +73: stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + rotldi r9,r9,16 +74: sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + rotldi r9,r9,8 +75: stb r9,0(r3) +3: li r3,0 + blr + +.Lsrc_unaligned: + srdi r6,r5,3 + addi r5,r5,-16 + subf r4,r0,r4 + srdi r7,r5,4 + sldi r10,r0,3 + cmpldi cr6,r6,3 + andi. r5,r5,7 + mtctr r7 + subfic r11,r10,64 + add r5,r5,r0 + bt cr7*4+0,28f + +24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */ +25: ld r0,8(r4) + sld r6,r9,r10 +26: ldu r9,16(r4) + srd r7,r0,r11 + sld r8,r0,r10 + or r7,r7,r6 + blt cr6,79f +27: ld r0,8(r4) + b 2f + +28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */ +29: ldu r9,8(r4) + sld r8,r0,r10 + addi r3,r3,-8 + blt cr6,5f +30: ld r0,8(r4) + srd r12,r9,r11 + sld r6,r9,r10 +31: ldu r9,16(r4) + or r12,r8,r12 + srd r7,r0,r11 + sld r8,r0,r10 + addi r3,r3,16 + beq cr6,78f + +1: or r7,r7,r6 +32: ld r0,8(r4) +76: std r12,8(r3) +2: srd r12,r9,r11 + sld r6,r9,r10 +33: ldu r9,16(r4) + or r12,r8,r12 +77: stdu r7,16(r3) + srd r7,r0,r11 + sld r8,r0,r10 + bdnz 1b + +78: std r12,8(r3) + or r7,r7,r6 +79: std r7,16(r3) +5: srd r12,r9,r11 + or r12,r8,r12 +80: std r12,24(r3) + bne 6f + li r3,0 + blr +6: cmpwi cr1,r5,8 + addi r3,r3,32 + sld r9,r9,r10 + ble cr1,.Ldo_tail +34: ld r0,8(r4) + srd r7,r0,r11 + or r9,r7,r9 + b .Ldo_tail + +.Ldst_unaligned: + mtcrf 0x01,r6 /* put #bytes to 8B bdry into cr7 */ + subf r5,r6,r5 + li r7,0 + cmpldi r1,r5,16 + bf cr7*4+3,1f +35: lbz r0,0(r4) +81: stb r0,0(r3) + addi r7,r7,1 +1: bf cr7*4+2,2f +36: lhzx r0,r7,r4 +82: sthx r0,r7,r3 + addi r7,r7,2 +2: bf cr7*4+1,3f +37: lwzx r0,r7,r4 +83: stwx r0,r7,r3 +3: mtcrf 0x01,r5 + add r4,r6,r4 + add r3,r6,r3 + b .Ldst_aligned + +.Lshort_copy: + bf cr7*4+0,1f +38: lwz r0,0(r4) +39: lwz r9,4(r4) + addi r4,r4,8 +84: stw r0,0(r3) +85: stw r9,4(r3) + addi r3,r3,8 +1: bf cr7*4+1,2f +40: lwz r0,0(r4) + addi r4,r4,4 +86: stw r0,0(r3) + addi r3,r3,4 +2: bf cr7*4+2,3f +41: lhz r0,0(r4) + addi r4,r4,2 +87: sth r0,0(r3) + addi r3,r3,2 +3: bf cr7*4+3,4f +42: lbz r0,0(r4) +88: stb r0,0(r3) +4: li r3,0 + blr + +/* + * exception handlers follow + * we have to return the number of bytes not copied + * for an exception on a load, we set the rest of the destination to 0 + */ + +136: +137: + add r3,r3,r7 + b 1f +130: +131: + addi r3,r3,8 +120: +122: +124: +125: +126: +127: +128: +129: +133: + addi r3,r3,8 +121: +132: + addi r3,r3,8 +123: +134: +135: +138: +139: +140: +141: +142: + +/* + * here we have had a fault on a load and r3 points to the first + * unmodified byte of the destination + */ +1: ld r6,-24(r1) + ld r4,-16(r1) + ld r5,-8(r1) + subf r6,r6,r3 + add r4,r4,r6 + subf r5,r6,r5 /* #bytes left to go */ + +/* + * first see if we can copy any more bytes before hitting another exception + */ + mtctr r5 +43: lbz r0,0(r4) + addi r4,r4,1 +89: stb r0,0(r3) + addi r3,r3,1 + bdnz 43b + li r3,0 /* huh? all copied successfully this time? */ + blr + +/* + * here we have trapped again, need to clear ctr bytes starting at r3 + */ +143: mfctr r5 + li r0,0 + mr r4,r3 + mr r3,r5 /* return the number of bytes not copied */ +1: andi. r9,r4,7 + beq 3f +90: stb r0,0(r4) + addic. r5,r5,-1 + addi r4,r4,1 + bne 1b + blr +3: cmpldi cr1,r5,8 + srdi r9,r5,3 + andi. r5,r5,7 + blt cr1,93f + mtctr r9 +91: std r0,0(r4) + addi r4,r4,8 + bdnz 91b +93: beqlr + mtctr r5 +92: stb r0,0(r4) + addi r4,r4,1 + bdnz 92b + blr + +/* + * exception handlers for stores: we just need to work + * out how many bytes weren't copied + */ +182: +183: + add r3,r3,r7 + b 1f +180: + addi r3,r3,8 +171: +177: + addi r3,r3,8 +170: +172: +176: +178: + addi r3,r3,4 +185: + addi r3,r3,4 +173: +174: +175: +179: +181: +184: +186: +187: +188: +189: +1: + ld r6,-24(r1) + ld r5,-8(r1) + add r6,r6,r5 + subf r3,r3,r6 /* #bytes not copied */ +190: +191: +192: + blr /* #bytes not copied in r3 */ + + .section __ex_table,"a" + .align 3 + .llong 20b,120b + .llong 21b,121b + .llong 70b,170b + .llong 22b,122b + .llong 71b,171b + .llong 72b,172b + .llong 23b,123b + .llong 73b,173b + .llong 74b,174b + .llong 75b,175b + .llong 24b,124b + .llong 25b,125b + .llong 26b,126b + .llong 27b,127b + .llong 28b,128b + .llong 29b,129b + .llong 30b,130b + .llong 31b,131b + .llong 32b,132b + .llong 76b,176b + .llong 33b,133b + .llong 77b,177b + .llong 78b,178b + .llong 79b,179b + .llong 80b,180b + .llong 34b,134b + .llong 35b,135b + .llong 81b,181b + .llong 36b,136b + .llong 82b,182b + .llong 37b,137b + .llong 83b,183b + .llong 38b,138b + .llong 39b,139b + .llong 84b,184b + .llong 85b,185b + .llong 40b,140b + .llong 86b,186b + .llong 41b,141b + .llong 87b,187b + .llong 42b,142b + .llong 88b,188b + .llong 43b,143b + .llong 89b,189b + .llong 90b,190b + .llong 91b,191b + .llong 92b,192b + + .text + +/* + * Routine to copy a whole page of data, optimized for POWER4. + * On POWER4 it is more than 50% faster than the simple loop + * above (following the .Ldst_aligned label) but it runs slightly + * slower on POWER3. + */ +.Lcopy_page: + std r31,-32(1) + std r30,-40(1) + std r29,-48(1) + std r28,-56(1) + std r27,-64(1) + std r26,-72(1) + std r25,-80(1) + std r24,-88(1) + std r23,-96(1) + std r22,-104(1) + std r21,-112(1) + std r20,-120(1) + li r5,4096/32 - 1 + addi r3,r3,-8 + li r0,5 +0: addi r5,r5,-24 + mtctr r0 +20: ld r22,640(4) +21: ld r21,512(4) +22: ld r20,384(4) +23: ld r11,256(4) +24: ld r9,128(4) +25: ld r7,0(4) +26: ld r25,648(4) +27: ld r24,520(4) +28: ld r23,392(4) +29: ld r10,264(4) +30: ld r8,136(4) +31: ldu r6,8(4) + cmpwi r5,24 +1: +32: std r22,648(3) +33: std r21,520(3) +34: std r20,392(3) +35: std r11,264(3) +36: std r9,136(3) +37: std r7,8(3) +38: ld r28,648(4) +39: ld r27,520(4) +40: ld r26,392(4) +41: ld r31,264(4) +42: ld r30,136(4) +43: ld r29,8(4) +44: std r25,656(3) +45: std r24,528(3) +46: std r23,400(3) +47: std r10,272(3) +48: std r8,144(3) +49: std r6,16(3) +50: ld r22,656(4) +51: ld r21,528(4) +52: ld r20,400(4) +53: ld r11,272(4) +54: ld r9,144(4) +55: ld r7,16(4) +56: std r28,664(3) +57: std r27,536(3) +58: std r26,408(3) +59: std r31,280(3) +60: std r30,152(3) +61: stdu r29,24(3) +62: ld r25,664(4) +63: ld r24,536(4) +64: ld r23,408(4) +65: ld r10,280(4) +66: ld r8,152(4) +67: ldu r6,24(4) + bdnz 1b +68: std r22,648(3) +69: std r21,520(3) +70: std r20,392(3) +71: std r11,264(3) +72: std r9,136(3) +73: std r7,8(3) +74: addi r4,r4,640 +75: addi r3,r3,648 + bge 0b + mtctr r5 +76: ld r7,0(4) +77: ld r8,8(4) +78: ldu r9,16(4) +3: +79: ld r10,8(4) +80: std r7,8(3) +81: ld r7,16(4) +82: std r8,16(3) +83: ld r8,24(4) +84: std r9,24(3) +85: ldu r9,32(4) +86: stdu r10,32(3) + bdnz 3b +4: +87: ld r10,8(4) +88: std r7,8(3) +89: std r8,16(3) +90: std r9,24(3) +91: std r10,32(3) +9: ld r20,-120(1) + ld r21,-112(1) + ld r22,-104(1) + ld r23,-96(1) + ld r24,-88(1) + ld r25,-80(1) + ld r26,-72(1) + ld r27,-64(1) + ld r28,-56(1) + ld r29,-48(1) + ld r30,-40(1) + ld r31,-32(1) + li r3,0 + blr + +/* + * on an exception, reset to the beginning and jump back into the + * standard __copy_tofrom_user + */ +100: ld r20,-120(1) + ld r21,-112(1) + ld r22,-104(1) + ld r23,-96(1) + ld r24,-88(1) + ld r25,-80(1) + ld r26,-72(1) + ld r27,-64(1) + ld r28,-56(1) + ld r29,-48(1) + ld r30,-40(1) + ld r31,-32(1) + ld r3,-24(r1) + ld r4,-16(r1) + li r5,4096 + b .Ldst_aligned + + .section __ex_table,"a" + .align 3 + .llong 20b,100b + .llong 21b,100b + .llong 22b,100b + .llong 23b,100b + .llong 24b,100b + .llong 25b,100b + .llong 26b,100b + .llong 27b,100b + .llong 28b,100b + .llong 29b,100b + .llong 30b,100b + .llong 31b,100b + .llong 32b,100b + .llong 33b,100b + .llong 34b,100b + .llong 35b,100b + .llong 36b,100b + .llong 37b,100b + .llong 38b,100b + .llong 39b,100b + .llong 40b,100b + .llong 41b,100b + .llong 42b,100b + .llong 43b,100b + .llong 44b,100b + .llong 45b,100b + .llong 46b,100b + .llong 47b,100b + .llong 48b,100b + .llong 49b,100b + .llong 50b,100b + .llong 51b,100b + .llong 52b,100b + .llong 53b,100b + .llong 54b,100b + .llong 55b,100b + .llong 56b,100b + .llong 57b,100b + .llong 58b,100b + .llong 59b,100b + .llong 60b,100b + .llong 61b,100b + .llong 62b,100b + .llong 63b,100b + .llong 64b,100b + .llong 65b,100b + .llong 66b,100b + .llong 67b,100b + .llong 68b,100b + .llong 69b,100b + .llong 70b,100b + .llong 71b,100b + .llong 72b,100b + .llong 73b,100b + .llong 74b,100b + .llong 75b,100b + .llong 76b,100b + .llong 77b,100b + .llong 78b,100b + .llong 79b,100b + .llong 80b,100b + .llong 81b,100b + .llong 82b,100b + .llong 83b,100b + .llong 84b,100b + .llong 85b,100b + .llong 86b,100b + .llong 87b,100b + .llong 88b,100b + .llong 89b,100b + .llong 90b,100b + .llong 91b,100b diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S new file mode 100644 index 00000000000..68df20283ff --- /dev/null +++ b/arch/powerpc/lib/mem_64.S @@ -0,0 +1,119 @@ +/* + * String handling functions for PowerPC. + * + * Copyright (C) 1996 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +_GLOBAL(memset) + neg r0,r3 + rlwimi r4,r4,8,16,23 + andi. r0,r0,7 /* # bytes to be 8-byte aligned */ + rlwimi r4,r4,16,0,15 + cmplw cr1,r5,r0 /* do we get that far? */ + rldimi r4,r4,32,0 + mtcrf 1,r0 + mr r6,r3 + blt cr1,8f + beq+ 3f /* if already 8-byte aligned */ + subf r5,r0,r5 + bf 31,1f + stb r4,0(r6) + addi r6,r6,1 +1: bf 30,2f + sth r4,0(r6) + addi r6,r6,2 +2: bf 29,3f + stw r4,0(r6) + addi r6,r6,4 +3: srdi. r0,r5,6 + clrldi r5,r5,58 + mtctr r0 + beq 5f +4: std r4,0(r6) + std r4,8(r6) + std r4,16(r6) + std r4,24(r6) + std r4,32(r6) + std r4,40(r6) + std r4,48(r6) + std r4,56(r6) + addi r6,r6,64 + bdnz 4b +5: srwi. r0,r5,3 + clrlwi r5,r5,29 + mtcrf 1,r0 + beq 8f + bf 29,6f + std r4,0(r6) + std r4,8(r6) + std r4,16(r6) + std r4,24(r6) + addi r6,r6,32 +6: bf 30,7f + std r4,0(r6) + std r4,8(r6) + addi r6,r6,16 +7: bf 31,8f + std r4,0(r6) + addi r6,r6,8 +8: cmpwi r5,0 + mtcrf 1,r5 + beqlr+ + bf 29,9f + stw r4,0(r6) + addi r6,r6,4 +9: bf 30,10f + sth r4,0(r6) + addi r6,r6,2 +10: bflr 31 + stb r4,0(r6) + blr + +_GLOBAL(memmove) + cmplw 0,r3,r4 + bgt .backwards_memcpy + b .memcpy + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b diff --git a/arch/powerpc/lib/memcpy.S b/arch/powerpc/lib/memcpy.S deleted file mode 100644 index 9ccacdf5bcb..00000000000 --- a/arch/powerpc/lib/memcpy.S +++ /dev/null @@ -1,172 +0,0 @@ -/* - * arch/ppc64/lib/memcpy.S - * - * Copyright (C) 2002 Paul Mackerras, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include - - .align 7 -_GLOBAL(memcpy) - mtcrf 0x01,r5 - cmpldi cr1,r5,16 - neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry - andi. r6,r6,7 - dcbt 0,r4 - blt cr1,.Lshort_copy - bne .Ldst_unaligned -.Ldst_aligned: - andi. r0,r4,7 - addi r3,r3,-16 - bne .Lsrc_unaligned - srdi r7,r5,4 - ld r9,0(r4) - addi r4,r4,-8 - mtctr r7 - andi. r5,r5,7 - bf cr7*4+0,2f - addi r3,r3,8 - addi r4,r4,8 - mr r8,r9 - blt cr1,3f -1: ld r9,8(r4) - std r8,8(r3) -2: ldu r8,16(r4) - stdu r9,16(r3) - bdnz 1b -3: std r8,8(r3) - beqlr - addi r3,r3,16 - ld r9,8(r4) -.Ldo_tail: - bf cr7*4+1,1f - rotldi r9,r9,32 - stw r9,0(r3) - addi r3,r3,4 -1: bf cr7*4+2,2f - rotldi r9,r9,16 - sth r9,0(r3) - addi r3,r3,2 -2: bf cr7*4+3,3f - rotldi r9,r9,8 - stb r9,0(r3) -3: blr - -.Lsrc_unaligned: - srdi r6,r5,3 - addi r5,r5,-16 - subf r4,r0,r4 - srdi r7,r5,4 - sldi r10,r0,3 - cmpdi cr6,r6,3 - andi. r5,r5,7 - mtctr r7 - subfic r11,r10,64 - add r5,r5,r0 - - bt cr7*4+0,0f - - ld r9,0(r4) # 3+2n loads, 2+2n stores - ld r0,8(r4) - sld r6,r9,r10 - ldu r9,16(r4) - srd r7,r0,r11 - sld r8,r0,r10 - or r7,r7,r6 - blt cr6,4f - ld r0,8(r4) - # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12 - b 2f - -0: ld r0,0(r4) # 4+2n loads, 3+2n stores - ldu r9,8(r4) - sld r8,r0,r10 - addi r3,r3,-8 - blt cr6,5f - ld r0,8(r4) - srd r12,r9,r11 - sld r6,r9,r10 - ldu r9,16(r4) - or r12,r8,r12 - srd r7,r0,r11 - sld r8,r0,r10 - addi r3,r3,16 - beq cr6,3f - - # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9 -1: or r7,r7,r6 - ld r0,8(r4) - std r12,8(r3) -2: srd r12,r9,r11 - sld r6,r9,r10 - ldu r9,16(r4) - or r12,r8,r12 - stdu r7,16(r3) - srd r7,r0,r11 - sld r8,r0,r10 - bdnz 1b - -3: std r12,8(r3) - or r7,r7,r6 -4: std r7,16(r3) -5: srd r12,r9,r11 - or r12,r8,r12 - std r12,24(r3) - beqlr - cmpwi cr1,r5,8 - addi r3,r3,32 - sld r9,r9,r10 - ble cr1,.Ldo_tail - ld r0,8(r4) - srd r7,r0,r11 - or r9,r7,r9 - b .Ldo_tail - -.Ldst_unaligned: - mtcrf 0x01,r6 # put #bytes to 8B bdry into cr7 - subf r5,r6,r5 - li r7,0 - cmpldi r1,r5,16 - bf cr7*4+3,1f - lbz r0,0(r4) - stb r0,0(r3) - addi r7,r7,1 -1: bf cr7*4+2,2f - lhzx r0,r7,r4 - sthx r0,r7,r3 - addi r7,r7,2 -2: bf cr7*4+1,3f - lwzx r0,r7,r4 - stwx r0,r7,r3 -3: mtcrf 0x01,r5 - add r4,r6,r4 - add r3,r6,r3 - b .Ldst_aligned - -.Lshort_copy: - bf cr7*4+0,1f - lwz r0,0(r4) - lwz r9,4(r4) - addi r4,r4,8 - stw r0,0(r3) - stw r9,4(r3) - addi r3,r3,8 -1: bf cr7*4+1,2f - lwz r0,0(r4) - addi r4,r4,4 - stw r0,0(r3) - addi r3,r3,4 -2: bf cr7*4+2,3f - lhz r0,0(r4) - addi r4,r4,2 - sth r0,0(r3) - addi r3,r3,2 -3: bf cr7*4+3,4f - lbz r0,0(r4) - stb r0,0(r3) -4: blr diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S new file mode 100644 index 00000000000..9ccacdf5bcb --- /dev/null +++ b/arch/powerpc/lib/memcpy_64.S @@ -0,0 +1,172 @@ +/* + * arch/ppc64/lib/memcpy.S + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + + .align 7 +_GLOBAL(memcpy) + mtcrf 0x01,r5 + cmpldi cr1,r5,16 + neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry + andi. r6,r6,7 + dcbt 0,r4 + blt cr1,.Lshort_copy + bne .Ldst_unaligned +.Ldst_aligned: + andi. r0,r4,7 + addi r3,r3,-16 + bne .Lsrc_unaligned + srdi r7,r5,4 + ld r9,0(r4) + addi r4,r4,-8 + mtctr r7 + andi. r5,r5,7 + bf cr7*4+0,2f + addi r3,r3,8 + addi r4,r4,8 + mr r8,r9 + blt cr1,3f +1: ld r9,8(r4) + std r8,8(r3) +2: ldu r8,16(r4) + stdu r9,16(r3) + bdnz 1b +3: std r8,8(r3) + beqlr + addi r3,r3,16 + ld r9,8(r4) +.Ldo_tail: + bf cr7*4+1,1f + rotldi r9,r9,32 + stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + rotldi r9,r9,16 + sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + rotldi r9,r9,8 + stb r9,0(r3) +3: blr + +.Lsrc_unaligned: + srdi r6,r5,3 + addi r5,r5,-16 + subf r4,r0,r4 + srdi r7,r5,4 + sldi r10,r0,3 + cmpdi cr6,r6,3 + andi. r5,r5,7 + mtctr r7 + subfic r11,r10,64 + add r5,r5,r0 + + bt cr7*4+0,0f + + ld r9,0(r4) # 3+2n loads, 2+2n stores + ld r0,8(r4) + sld r6,r9,r10 + ldu r9,16(r4) + srd r7,r0,r11 + sld r8,r0,r10 + or r7,r7,r6 + blt cr6,4f + ld r0,8(r4) + # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12 + b 2f + +0: ld r0,0(r4) # 4+2n loads, 3+2n stores + ldu r9,8(r4) + sld r8,r0,r10 + addi r3,r3,-8 + blt cr6,5f + ld r0,8(r4) + srd r12,r9,r11 + sld r6,r9,r10 + ldu r9,16(r4) + or r12,r8,r12 + srd r7,r0,r11 + sld r8,r0,r10 + addi r3,r3,16 + beq cr6,3f + + # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9 +1: or r7,r7,r6 + ld r0,8(r4) + std r12,8(r3) +2: srd r12,r9,r11 + sld r6,r9,r10 + ldu r9,16(r4) + or r12,r8,r12 + stdu r7,16(r3) + srd r7,r0,r11 + sld r8,r0,r10 + bdnz 1b + +3: std r12,8(r3) + or r7,r7,r6 +4: std r7,16(r3) +5: srd r12,r9,r11 + or r12,r8,r12 + std r12,24(r3) + beqlr + cmpwi cr1,r5,8 + addi r3,r3,32 + sld r9,r9,r10 + ble cr1,.Ldo_tail + ld r0,8(r4) + srd r7,r0,r11 + or r9,r7,r9 + b .Ldo_tail + +.Ldst_unaligned: + mtcrf 0x01,r6 # put #bytes to 8B bdry into cr7 + subf r5,r6,r5 + li r7,0 + cmpldi r1,r5,16 + bf cr7*4+3,1f + lbz r0,0(r4) + stb r0,0(r3) + addi r7,r7,1 +1: bf cr7*4+2,2f + lhzx r0,r7,r4 + sthx r0,r7,r3 + addi r7,r7,2 +2: bf cr7*4+1,3f + lwzx r0,r7,r4 + stwx r0,r7,r3 +3: mtcrf 0x01,r5 + add r4,r6,r4 + add r3,r6,r3 + b .Ldst_aligned + +.Lshort_copy: + bf cr7*4+0,1f + lwz r0,0(r4) + lwz r9,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r9,4(r3) + addi r3,r3,8 +1: bf cr7*4+1,2f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 +2: bf cr7*4+2,3f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 +3: bf cr7*4+3,4f + lbz r0,0(r4) + stb r0,0(r3) +4: blr diff --git a/arch/powerpc/lib/usercopy.c b/arch/powerpc/lib/usercopy.c deleted file mode 100644 index 5eea6f3c1e0..00000000000 --- a/arch/powerpc/lib/usercopy.c +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Functions which are too large to be inlined. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include - -unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) -{ - if (likely(access_ok(VERIFY_READ, from, n))) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} - -unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) -{ - if (likely(access_ok(VERIFY_WRITE, to, n))) - n = __copy_to_user(to, from, n); - return n; -} - -unsigned long copy_in_user(void __user *to, const void __user *from, - unsigned long n) -{ - might_sleep(); - if (likely(access_ok(VERIFY_READ, from, n) && - access_ok(VERIFY_WRITE, to, n))) - n =__copy_tofrom_user(to, from, n); - return n; -} - -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); - diff --git a/arch/powerpc/lib/usercopy_64.c b/arch/powerpc/lib/usercopy_64.c new file mode 100644 index 00000000000..5eea6f3c1e0 --- /dev/null +++ b/arch/powerpc/lib/usercopy_64.c @@ -0,0 +1,41 @@ +/* + * Functions which are too large to be inlined. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) +{ + if (likely(access_ok(VERIFY_READ, from, n))) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} + +unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (likely(access_ok(VERIFY_WRITE, to, n))) + n = __copy_to_user(to, from, n); + return n; +} + +unsigned long copy_in_user(void __user *to, const void __user *from, + unsigned long n) +{ + might_sleep(); + if (likely(access_ok(VERIFY_READ, from, n) && + access_ok(VERIFY_WRITE, to, n))) + n =__copy_tofrom_user(to, from, n); + return n; +} + +EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(copy_in_user); + -- cgit v1.2.3