From e16a9c0990a18ee2f2874363392fdfedcf9ded58 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Thu, 31 Jul 2008 13:51:42 +1000 Subject: powerpc: Guard htab_dt_scan_hugepage_blocks appropriately htab_dt_scan_hugepage_blocks is only used when CONFIG_HUGETLB_PAGE is defined, so guard the declaration likewise. Signed-off-by: Tony Breeds Signed-off-by: Paul Mackerras --- arch/powerpc/mm/hash_utils_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 14be408dfc9..7efcb9c5217 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -343,6 +343,7 @@ static int __init htab_dt_scan_page_sizes(unsigned long node, return 0; } +#ifdef CONFIG_HUGETLB_PAGE /* Scan for 16G memory blocks that have been set aside for huge pages * and reserve those blocks for 16G huge pages. */ @@ -380,6 +381,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, add_gpage(phys_addr, block_size, expected_pages); return 0; } +#endif /* CONFIG_HUGETLB_PAGE */ static void __init htab_init_page_sizes(void) { -- cgit v1.2.3 From cf00085d8045cddd80a8aabad97de96fa8131793 Mon Sep 17 00:00:00 2001 From: Chandru Date: Sat, 30 Aug 2008 00:28:16 +1000 Subject: powerpc: Add support for dynamic reconfiguration memory in kexec/kdump kernels Kdump kernel needs to use only those memory regions that it is allowed to use (crashkernel, rtas, tce, etc.). Each of these regions have their own sizes and are currently added under 'linux,usable-memory' property under each memory@xxx node of the device tree. The ibm,dynamic-memory property of ibm,dynamic-reconfiguration-memory node (on POWER6) now stores in it the representation for most of the logical memory blocks with the size of each memory block being a constant (lmb_size). If one or more or part of the above mentioned regions lie under one of the lmb from ibm,dynamic-memory property, there is a need to identify those regions within the given lmb. This makes the kernel recognize a new 'linux,drconf-usable-memory' property added by kexec-tools. Each entry in this property is of the form of a count followed by that many (base, size) pairs for the above mentioned regions. The number of cells in the count value is given by the #size-cells property of the root node. Signed-off-by: Chandru Siddalingappa Signed-off-by: Paul Mackerras --- arch/powerpc/mm/numa.c | 77 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 61 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index d9a18135133..be05457631d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -150,6 +150,21 @@ static const int *of_get_associativity(struct device_node *dev) return of_get_property(dev, "ibm,associativity", NULL); } +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const u32 *of_get_usable_memory(struct device_node *memory) +{ + const u32 *prop; + u32 len; + prop = of_get_property(memory, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + return prop; +} + /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ @@ -486,15 +501,30 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, return lmb_end_of_DRAM() - start; } +/* + * Reads the counter for a given entry in + * linux,drconf-usable-memory property + */ +static inline int __init read_usm_ranges(const u32 **usm) +{ + /* + * For each lmb in ibm,dynamic-memory a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter followed by that many (base, size) duple. + * read the counter from linux,drconf-usable-memory + */ + return read_n_cells(n_mem_size_cells, usm); +} + /* * Extract NUMA information from the ibm,dynamic-reconfiguration-memory * node. This assumes n_mem_{addr,size}_cells have been set. */ static void __init parse_drconf_memory(struct device_node *memory) { - const u32 *dm; - unsigned int n, rc; - unsigned long lmb_size, size; + const u32 *dm, *usm; + unsigned int n, rc, ranges, is_kexec_kdump = 0; + unsigned long lmb_size, base, size, sz; int nid; struct assoc_arrays aa; @@ -510,6 +540,11 @@ static void __init parse_drconf_memory(struct device_node *memory) if (rc) return; + /* check if this is a kexec/kdump kernel */ + usm = of_get_usable_memory(memory); + if (usm != NULL) + is_kexec_kdump = 1; + for (; n != 0; --n) { struct of_drconf_cell drmem; @@ -521,21 +556,31 @@ static void __init parse_drconf_memory(struct device_node *memory) || !(drmem.flags & DRCONF_MEM_ASSIGNED)) continue; - nid = of_drconf_to_nid_single(&drmem, &aa); + base = drmem.base_addr; + size = lmb_size; + ranges = 1; - fake_numa_create_new_node( - ((drmem.base_addr + lmb_size) >> PAGE_SHIFT), + if (is_kexec_kdump) { + ranges = read_usm_ranges(&usm); + if (!ranges) /* there are no (base, size) duple */ + continue; + } + do { + if (is_kexec_kdump) { + base = read_n_cells(n_mem_addr_cells, &usm); + size = read_n_cells(n_mem_size_cells, &usm); + } + nid = of_drconf_to_nid_single(&drmem, &aa); + fake_numa_create_new_node( + ((base + size) >> PAGE_SHIFT), &nid); - - node_set_online(nid); - - size = numa_enforce_memory_limit(drmem.base_addr, lmb_size); - if (!size) - continue; - - add_active_range(nid, drmem.base_addr >> PAGE_SHIFT, - (drmem.base_addr >> PAGE_SHIFT) - + (size >> PAGE_SHIFT)); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + add_active_range(nid, base >> PAGE_SHIFT, + (base >> PAGE_SHIFT) + + (sz >> PAGE_SHIFT)); + } while (--ranges); } } -- cgit v1.2.3 From 549e8152de8039506f69c677a4546e5427aa6ae7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 30 Aug 2008 11:43:47 +1000 Subject: powerpc: Make the 64-bit kernel as a position-independent executable This implements CONFIG_RELOCATABLE for 64-bit by making the kernel as a position-independent executable (PIE) when it is set. This involves processing the dynamic relocations in the image in the early stages of booting, even if the kernel is being run at the address it is linked at, since the linker does not necessarily fill in words in the image for which there are dynamic relocations. (In fact the linker does fill in such words for 64-bit executables, though not for 32-bit executables, so in principle we could avoid calling relocate() entirely when we're running a 64-bit kernel at the linked address.) The dynamic relocations are processed by a new function relocate(addr), where the addr parameter is the virtual address where the image will be run. In fact we call it twice; once before calling prom_init, and again when starting the main kernel. This means that reloc_offset() returns 0 in prom_init (since it has been relocated to the address it is running at), which necessitated a few adjustments. This also changes __va and __pa to use an equivalent definition that is simpler. With the relocatable kernel, PAGE_OFFSET and MEMORY_START are constants (for 64-bit) whereas PHYSICAL_START is a variable (and KERNELBASE ideally should be too, but isn't yet). With this, relocatable kernels still copy themselves down to physical address 0 and run there. Signed-off-by: Paul Mackerras --- arch/powerpc/mm/hash_utils_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index d666f71f1f3..09db4efe192 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -194,7 +194,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long tprot = prot; /* Make kernel text executable */ - if (in_kernel_text(vaddr)) + if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; hash = hpt_hash(va, shift, ssize); -- cgit v1.2.3 From aaf4a9b0f78786e6915077cbbb1d6f4fb6a8ee0b Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Thu, 4 Sep 2008 01:37:53 +1000 Subject: powerpc: Rename PTE_SIZE to HPTE_SIZE It's the size of the hardware PTE; make that clear in the name. Signed-off-by: Becky Bruce Acked-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/mm/hash_low_32.S | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index b9ba7d93080..c41d658176a 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -285,7 +285,7 @@ Hash_bits = 12 /* e.g. 256kB hash table */ Hash_msk = (((1 << Hash_bits) - 1) * 64) /* defines for the PTE format for 32-bit PPCs */ -#define PTE_SIZE 8 +#define HPTE_SIZE 8 #define PTEG_SIZE 64 #define LG_PTEG_SIZE 6 #define LDPTEu lwzu @@ -342,8 +342,8 @@ _GLOBAL(hash_page_patch_A) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 - addi r4,r3,-PTE_SIZE -1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */ + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ CMPPTE 0,r6,r5 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ found_slot @@ -353,9 +353,9 @@ _GLOBAL(hash_page_patch_A) _GLOBAL(hash_page_patch_B) xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-PTE_SIZE + addi r4,r4,-HPTE_SIZE mtctr r0 -2: LDPTEu r6,PTE_SIZE(r4) +2: LDPTEu r6,HPTE_SIZE(r4) CMPPTE 0,r6,r5 bdnzf 2,2b beq+ found_slot @@ -363,8 +363,8 @@ _GLOBAL(hash_page_patch_B) /* Search the primary PTEG for an empty slot */ 10: mtctr r0 - addi r4,r3,-PTE_SIZE /* search primary PTEG */ -1: LDPTEu r6,PTE_SIZE(r4) /* get next PTE */ + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ TST_V(r6) /* test valid bit */ bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ found_empty @@ -380,9 +380,9 @@ _GLOBAL(hash_page_patch_B) _GLOBAL(hash_page_patch_C) xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ xori r4,r4,(-PTEG_SIZE & 0xffff) - addi r4,r4,-PTE_SIZE + addi r4,r4,-HPTE_SIZE mtctr r0 -2: LDPTEu r6,PTE_SIZE(r4) +2: LDPTEu r6,HPTE_SIZE(r4) TST_V(r6) bdnzf 2,2b beq+ found_empty @@ -409,11 +409,11 @@ _GLOBAL(hash_page_patch_C) 1: addis r4,r7,next_slot@ha /* get next evict slot */ lwz r6,next_slot@l(r4) - addi r6,r6,PTE_SIZE /* search for candidate */ - andi. r6,r6,7*PTE_SIZE + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) add r4,r3,r6 - LDPTE r0,PTE_SIZE/2(r4) /* get PTE second word */ + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ clrrwi r0,r0,12 lis r6,etext@h ori r6,r6,etext@l /* get etext */ @@ -426,7 +426,7 @@ _GLOBAL(hash_page_patch_C) found_empty: STPTE r5,0(r4) found_slot: - STPTE r8,PTE_SIZE/2(r4) + STPTE r8,HPTE_SIZE/2(r4) #else /* CONFIG_SMP */ /* @@ -452,7 +452,7 @@ found_slot: STPTE r5,0(r4) sync TLBSYNC - STPTE r8,PTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ sync SET_V(r5) STPTE r5,0(r4) /* finally set V bit in PTE */ @@ -562,8 +562,8 @@ _GLOBAL(flush_hash_patch_A) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ li r0,8 /* PTEs/group */ mtctr r0 - addi r12,r8,-PTE_SIZE -1: LDPTEu r0,PTE_SIZE(r12) /* get next PTE */ + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ CMPPTE 0,r0,r11 bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ beq+ 3f @@ -574,9 +574,9 @@ _GLOBAL(flush_hash_patch_A) _GLOBAL(flush_hash_patch_B) xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ xori r12,r12,(-PTEG_SIZE & 0xffff) - addi r12,r12,-PTE_SIZE + addi r12,r12,-HPTE_SIZE mtctr r0 -2: LDPTEu r0,PTE_SIZE(r12) +2: LDPTEu r0,HPTE_SIZE(r12) CMPPTE 0,r0,r11 bdnzf 2,2b xori r11,r11,PTE_H /* clear H again */ -- cgit v1.2.3 From 0b26425ce101a7c1b1ad4ec353d4e860223d9fc4 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 5 Sep 2008 11:49:54 +1000 Subject: powerpc: Clean up hugepage pagetable allocation for powerpc with 16G pages There is a small bug in the handling of 16G hugepages recently added to the kernel. This doesn't cause a crash or other user-visible problems, but it does mean that more levels of pagetable are allocated than makes sense for 16G pages. The hugepage pagetables for the 16G pages are allocated much lower in the pagetable tree than they should be, with the intervening levels allocated with full pmd and pud pages which will only ever have one entry filled in. This corrects this problem, at the same time cleaning up the handling of which level 64k versus 16M hugepage pagetables are allocated at. The new way of formatting the tests should be more robust against changes in pagetable structure, or any newly added hugepage sizes. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/mm/hugetlbpage.c | 59 ++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 26 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f1c2d55b437..a117024ab8c 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -128,29 +128,37 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, return 0; } -/* Base page size affects how we walk hugetlb page tables */ -#ifdef CONFIG_PPC_64K_PAGES -#define hpmd_offset(pud, addr, h) pmd_offset(pud, addr) -#define hpmd_alloc(mm, pud, addr, h) pmd_alloc(mm, pud, addr) -#else -static inline -pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) + +static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PUD_SHIFT) + return pud_offset(pgd, addr); + else + return (pud_t *) pgd; +} +static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, + struct hstate *hstate) { - if (huge_page_shift(hstate) == PAGE_SHIFT_64K) + if (huge_page_shift(hstate) < PUD_SHIFT) + return pud_alloc(mm, pgd, addr); + else + return (pud_t *) pgd; +} +static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) +{ + if (huge_page_shift(hstate) < PMD_SHIFT) return pmd_offset(pud, addr); else return (pmd_t *) pud; } -static inline -pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, - struct hstate *hstate) +static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, + struct hstate *hstate) { - if (huge_page_shift(hstate) == PAGE_SHIFT_64K) + if (huge_page_shift(hstate) < PMD_SHIFT) return pmd_alloc(mm, pud, addr); else return (pmd_t *) pud; } -#endif /* Build list of addresses of gigantic pages. This function is used in early * boot before the buddy or bootmem allocator is setup. @@ -204,7 +212,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pg = pgd_offset(mm, addr); if (!pgd_none(*pg)) { - pu = pud_offset(pg, addr); + pu = hpud_offset(pg, addr, hstate); if (!pud_none(*pu)) { pm = hpmd_offset(pu, addr, hstate); if (!pmd_none(*pm)) @@ -233,7 +241,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, addr &= hstate->mask; pg = pgd_offset(mm, addr); - pu = pud_alloc(mm, pg, addr); + pu = hpud_alloc(mm, pg, addr, hstate); if (pu) { pm = hpmd_alloc(mm, pu, addr, hstate); @@ -316,13 +324,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); -#ifdef CONFIG_PPC_64K_PAGES - if (pud_none_or_clear_bad(pud)) - continue; - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling, - psize); -#else - if (shift == PAGE_SHIFT_64K) { + if (shift < PMD_SHIFT) { if (pud_none_or_clear_bad(pud)) continue; hugetlb_free_pmd_range(tlb, pud, addr, next, floor, @@ -332,7 +334,6 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, continue; free_hugepte_range(tlb, (hugepd_t *)pud, psize); } -#endif } while (pud++, addr = next, addr != end); start &= PGDIR_MASK; @@ -422,9 +423,15 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, psize = get_slice_psize(tlb->mm, addr); BUG_ON(!mmu_huge_psizes[psize]); next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + if (mmu_psize_to_shift(psize) < PUD_SHIFT) { + if (pgd_none_or_clear_bad(pgd)) + continue; + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } else { + if (pgd_none(*pgd)) + continue; + free_hugepte_range(tlb, (hugepd_t *)pgd, psize); + } } while (pgd++, addr = next, addr != end); } -- cgit v1.2.3 From 82331ab15f14786f3d8e874efb76462685e3bfa0 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Thu, 21 Aug 2008 13:50:22 -0500 Subject: powerpc/85xx: fix build warning, remove silly cast This fixes a build warning when PHYS_64BIT is enabled, and removes an unnecessary cast to phys_addr_t (the variable being cast is already a phys_addr_t) Signed-off-by: Becky Bruce Signed-off-by: Kumar Gala --- arch/powerpc/mm/fsl_booke_mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index ce10e2b1b90..23cee39534f 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -202,7 +202,7 @@ adjust_total_lowmem(void) cam_max_size = max_lowmem_size; /* adjust lowmem size to max_lowmem_size */ - ram = min(max_lowmem_size, (phys_addr_t)total_lowmem); + ram = min(max_lowmem_size, total_lowmem); /* Calculate CAM values */ __cam0 = 1UL << 2 * (__ilog2(ram) / 2); @@ -225,7 +225,8 @@ adjust_total_lowmem(void) printk(KERN_INFO "Memory CAM mapping: CAM0=%ldMb, CAM1=%ldMb," " CAM2=%ldMb residual: %ldMb\n", __cam0 >> 20, __cam1 >> 20, __cam2 >> 20, - (total_lowmem - __cam0 - __cam1 - __cam2) >> 20); + (long int)((total_lowmem - __cam0 - __cam1 - __cam2) + >> 20)); __max_low_memory = __cam0 + __cam1 + __cam2; __initial_memory_limit_addr = memstart_addr + __max_low_memory; } -- cgit v1.2.3 From 4ee7084eb11e00eb02dc8435fd18273a61ffa9bf Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 24 Sep 2008 11:01:24 -0500 Subject: POWERPC: Allow 32-bit hashed pgtable code to support 36-bit physical This rearranges a bit of code, and adds support for 36-bit physical addressing for configs that use a hashed page table. The 36b physical support is not enabled by default on any config - it must be explicitly enabled via the config system. This patch *only* expands the page table code to accomodate large physical addresses on 32-bit systems and enables the PHYS_64BIT config option for 86xx. It does *not* allow you to boot a board with more than about 3.5GB of RAM - for that, SWIOTLB support is also required (and coming soon). Signed-off-by: Becky Bruce Signed-off-by: Kumar Gala --- arch/powerpc/mm/hash_low_32.S | 86 +++++++++++++++++++++++++++++++++++-------- arch/powerpc/mm/pgtable_32.c | 4 +- arch/powerpc/mm/tlb_32.c | 1 + 3 files changed, 73 insertions(+), 18 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index c41d658176a..7bffb70b9fe 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -75,7 +75,7 @@ _GLOBAL(hash_page_sync) * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, ctr, lr. + * Uses r0, r3 - r8, r10, ctr, lr. */ .text _GLOBAL(hash_page) @@ -106,9 +106,15 @@ _GLOBAL(hash_page) addi r5,r5,swapper_pg_dir@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ 112: add r5,r5,r7 /* convert to phys addr */ +#ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif #ifdef CONFIG_SMP beq- hash_page_out /* return if no mapping */ #else @@ -118,7 +124,11 @@ _GLOBAL(hash_page) to the address following the rfi. */ beqlr- #endif +#ifndef CONFIG_PTE_64BIT rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE @@ -127,9 +137,15 @@ _GLOBAL(hash_page) * because almost always, there won't be a permission violation * and there won't already be an HPTE, and thus we will have * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif retry: - lwarx r6,0,r8 /* get linux-style pte */ + lwarx r6,0,r8 /* get linux-style pte, flag word */ andc. r5,r3,r6 /* check access & ~permission */ #ifdef CONFIG_SMP bne- hash_page_out /* return if access not permitted */ @@ -137,6 +153,15 @@ retry: bnelr- #endif or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ stwcx. r5,0,r8 /* attempt to update PTE */ bne- retry /* retry if someone got there first */ @@ -203,9 +228,9 @@ _GLOBAL(add_hash_page) * we can't take a hash table miss (assuming the code is * covered by a BAT). -- paulus */ - mfmsr r10 + mfmsr r9 SYNC - rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ rlwinm r0,r0,0,28,26 /* clear MSR_DR */ mtmsr r0 SYNC_601 @@ -214,14 +239,14 @@ _GLOBAL(add_hash_page) tophys(r7,0) #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l -10: lwarx r0,0,r9 /* take the mmu_hash_lock */ + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ cmpi 0,r0,0 bne- 11f - stwcx. r8,0,r9 + stwcx. r8,0,r6 beq+ 12f -11: lwz r0,0(r9) +11: lwz r0,0(r6) cmpi 0,r0,0 beq 10b b 11b @@ -234,10 +259,24 @@ _GLOBAL(add_hash_page) * HPTE, so we just unlock and return. */ mr r8,r5 +#ifndef CONFIG_PTE_64BIT rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif 1: lwarx r6,0,r8 andi. r0,r6,_PAGE_HASHPTE bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ ori r5,r6,_PAGE_HASHPTE stwcx. r5,0,r8 bne- 1b @@ -246,13 +285,15 @@ _GLOBAL(add_hash_page) 9: #ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l eieio li r0,0 - stw r0,0(r9) /* clear mmu_hash_lock */ + stw r0,0(r6) /* clear mmu_hash_lock */ #endif /* reenable interrupts and DR */ - mtmsr r10 + mtmsr r9 SYNC_601 isync @@ -267,7 +308,8 @@ _GLOBAL(add_hash_page) * r5 contains the linux PTE, r6 contains the old value of the * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). + * -KERNELBASE if it is off). r10 contains the upper half of + * the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. The value passed in r6 should @@ -313,6 +355,11 @@ _GLOBAL(create_hpte) BEGIN_FTR_SECTION ori r8,r8,_PAGE_COHERENT /* set M (coherence required) */ END_FTR_SECTION_IFSET(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif /* Construct the high word of the PPC-style PTE (r5) */ rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ @@ -499,14 +546,18 @@ _GLOBAL(flush_hash_pages) isync /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT rlwimi r5,r4,22,20,29 -1: lwz r0,0(r5) +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) cmpwi cr1,r6,1 andi. r0,r0,_PAGE_HASHPTE bne 2f ble cr1,19f addi r4,r4,0x1000 - addi r5,r5,4 + addi r5,r5,PTE_SIZE addi r6,r6,-1 b 1b @@ -545,7 +596,10 @@ _GLOBAL(flush_hash_pages) * already clear, we're done (for this pte). If not, * clear it (atomically) and proceed. -- paulus. */ -33: lwarx r8,0,r5 /* fetch the pte */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ andi. r0,r8,_PAGE_HASHPTE beq 8f /* done if HASHPTE is already clear */ rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ @@ -590,7 +644,7 @@ _GLOBAL(flush_hash_patch_B) 8: ble cr1,9f /* if all ptes checked */ 81: addi r6,r6,-1 - addi r5,r5,4 /* advance to next pte */ + addi r5,r5,PTE_SIZE addi r4,r4,0x1000 lwz r0,0(r5) /* check next pte */ cmpwi cr1,r6,1 diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 2001abdb191..c31d6d26f0b 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -73,7 +73,7 @@ extern unsigned long p_mapped_by_tlbcam(unsigned long pa); #endif /* HAVE_TLBCAM */ #ifdef CONFIG_PTE_64BIT -/* 44x uses an 8kB pgdir because it has 8-byte Linux PTEs. */ +/* Some processors use an 8kB pgdir because they have 8-byte Linux PTEs. */ #define PGDIR_ORDER 1 #else #define PGDIR_ORDER 0 @@ -288,7 +288,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags) } /* - * Map in all of physical memory starting at KERNELBASE. + * Map in a big chunk of physical memory starting at KERNELBASE. */ void __init mapin_ram(void) { diff --git a/arch/powerpc/mm/tlb_32.c b/arch/powerpc/mm/tlb_32.c index eb4b512d65f..f9a47fee392 100644 --- a/arch/powerpc/mm/tlb_32.c +++ b/arch/powerpc/mm/tlb_32.c @@ -45,6 +45,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) flush_hash_pages(mm->context.id, addr, ptephys, 1); } } +EXPORT_SYMBOL(flush_hash_entry); /* * Called by ptep_set_access_flags, must flush on CPUs for which the -- cgit v1.2.3 From a880e7623397bcb44877b012cd65baa11ad1bbf8 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 15 Sep 2008 10:43:35 +0000 Subject: powerpc: Avoid integer overflow in page_is_ram() Commit 8b150478 ("ppc: make phys_mem_access_prot() work with pfns instead of addresses") fixed page_is_ram() in arch/ppc to avoid overflow for addresses above 4G on 32-bit kernels. However arch/powerpc's page_is_ram() is missing the same fix -- it computes a physical address by doing pfn << PAGE_SHIFT, which overflows if pfn corresponds to a page above 4G. In particular this causes pages above 4G to be mapped with the wrong caching attribute; for example many ppc440-based SoCs have PCI space above 4G, and mmap()ing MMIO space may end up with a mapping that has caching enabled. Fix this by working with the pfn and avoiding the conversion to physical address that causes the overflow. This patch compares the pfn to max_pfn, which is a semantic change from the old code -- that code compared the physical address to high_memory, which corresponds to max_low_pfn. However, I think that was is another bug, since highmem pages are still RAM. Reported-by: vb Signed-off-by: Roland Dreier Acked-by: Benjamin Herrenschmidt Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 1c93c255873..98d7bf99533 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -75,11 +75,10 @@ static inline pte_t *virt_to_kpte(unsigned long vaddr) int page_is_ram(unsigned long pfn) { - unsigned long paddr = (pfn << PAGE_SHIFT); - #ifndef CONFIG_PPC64 /* XXX for now */ - return paddr < __pa(high_memory); + return pfn < max_pfn; #else + unsigned long paddr = (pfn << PAGE_SHIFT); int i; for (i=0; i < lmb.memory.cnt; i++) { unsigned long base; -- cgit v1.2.3 From 8f64e1f2d1e09267ac926e15090fd505c1c0cbcb Mon Sep 17 00:00:00 2001 From: Jon Tollefson Date: Thu, 9 Oct 2008 10:18:40 +0000 Subject: powerpc: Reserve in bootmem lmb reserved regions that cross NUMA nodes If there are multiple reserved memory blocks via lmb_reserve() that are contiguous addresses and on different NUMA nodes we are losing track of which address ranges to reserve in bootmem on which node. I discovered this when I recently got to try 16GB huge pages on a system with more then 2 nodes. When scanning the device tree in early boot we call lmb_reserve() with the addresses of the 16G pages that we find so that the memory doesn't get used for something else. For example the addresses for the pages could be 4000000000, 4400000000, 4800000000, 4C00000000, etc - 8 pages, one on each of eight nodes. In the lmb after all the pages have been reserved it will look something like the following: lmb_dump_all: memory.cnt = 0x2 memory.size = 0x3e80000000 memory.region[0x0].base = 0x0 .size = 0x1e80000000 memory.region[0x1].base = 0x4000000000 .size = 0x2000000000 reserved.cnt = 0x5 reserved.size = 0x3e80000000 reserved.region[0x0].base = 0x0 .size = 0x7b5000 reserved.region[0x1].base = 0x2a00000 .size = 0x78c000 reserved.region[0x2].base = 0x328c000 .size = 0x43000 reserved.region[0x3].base = 0xf4e8000 .size = 0xb18000 reserved.region[0x4].base = 0x4000000000 .size = 0x2000000000 The reserved.region[0x4] contains the 16G pages. In arch/powerpc/mm/num.c: do_init_bootmem() we loop through each of the node numbers looking for the reserved regions that belong to the particular node. It is not able to identify region 0x4 as being a part of each of the 8 nodes. It is assuming that a reserved region is only on a single node. This patch takes out the reserved region loop from inside the loop that goes over each node. It looks up the active region containing the start of the reserved region. If it extends past that active region then it adjusts the size and gets the next active region containing it. Signed-off-by: Jon Tollefson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/numa.c | 108 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index be05457631d..6cf5c71c431 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -89,6 +89,46 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, return 0; } +/* + * get_active_region_work_fn - A helper function for get_node_active_region + * Returns datax set to the start_pfn and end_pfn if they contain + * the initial value of datax->start_pfn between them + * @start_pfn: start page(inclusive) of region to check + * @end_pfn: end page(exclusive) of region to check + * @datax: comes in with ->start_pfn set to value to search for and + * goes out with active range if it contains it + * Returns 1 if search value is in range else 0 + */ +static int __init get_active_region_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct node_active_region *data; + data = (struct node_active_region *)datax; + + if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) { + data->start_pfn = start_pfn; + data->end_pfn = end_pfn; + return 1; + } + return 0; + +} + +/* + * get_node_active_region - Return active region containing start_pfn + * @start_pfn: The page to return the region for. + * @node_ar: Returned set to the active region containing start_pfn + */ +static void __init get_node_active_region(unsigned long start_pfn, + struct node_active_region *node_ar) +{ + int nid = early_pfn_to_nid(start_pfn); + + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + work_with_active_regions(nid, get_active_region_work_fn, node_ar); +} + static void __cpuinit map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -882,38 +922,50 @@ void __init do_init_bootmem(void) start_pfn, end_pfn); free_bootmem_with_active_regions(nid, end_pfn); + } - /* Mark reserved regions on this node */ - for (i = 0; i < lmb.reserved.cnt; i++) { - unsigned long physbase = lmb.reserved.region[i].base; - unsigned long size = lmb.reserved.region[i].size; - unsigned long start_paddr = start_pfn << PAGE_SHIFT; - unsigned long end_paddr = end_pfn << PAGE_SHIFT; - - if (early_pfn_to_nid(physbase >> PAGE_SHIFT) != nid && - early_pfn_to_nid((physbase+size-1) >> PAGE_SHIFT) != nid) - continue; - - if (physbase < end_paddr && - (physbase+size) > start_paddr) { - /* overlaps */ - if (physbase < start_paddr) { - size -= start_paddr - physbase; - physbase = start_paddr; - } - - if (size > end_paddr - physbase) - size = end_paddr - physbase; - - dbg("reserve_bootmem %lx %lx\n", physbase, - size); - reserve_bootmem_node(NODE_DATA(nid), physbase, - size, BOOTMEM_DEFAULT); - } + /* Mark reserved regions */ + for (i = 0; i < lmb.reserved.cnt; i++) { + unsigned long physbase = lmb.reserved.region[i].base; + unsigned long size = lmb.reserved.region[i].size; + unsigned long start_pfn = physbase >> PAGE_SHIFT; + unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT); + struct node_active_region node_ar; + + get_node_active_region(start_pfn, &node_ar); + while (start_pfn < end_pfn) { + /* + * if reserved region extends past active region + * then trim size to active region + */ + if (end_pfn > node_ar.end_pfn) + size = (node_ar.end_pfn << PAGE_SHIFT) + - (start_pfn << PAGE_SHIFT); + dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, size, + node_ar.nid); + reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase, + size, BOOTMEM_DEFAULT); + /* + * if reserved region is contained in the active region + * then done. + */ + if (end_pfn <= node_ar.end_pfn) + break; + + /* + * reserved region extends past the active region + * get next active region that contains this + * reserved region + */ + start_pfn = node_ar.end_pfn; + physbase = start_pfn << PAGE_SHIFT; + get_node_active_region(start_pfn, &node_ar); } - sparse_memory_present_with_active_regions(nid); } + + for_each_online_node(nid) + sparse_memory_present_with_active_regions(nid); } void __init paging_init(void) -- cgit v1.2.3 From f5ea64dcbad89875d130596df14c9b25d994a737 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Sun, 12 Oct 2008 17:54:24 +0000 Subject: powerpc: Get USE_STRICT_MM_TYPECHECKS working again The typesafe version of the powerpc pagetable handling (with USE_STRICT_MM_TYPECHECKS defined) has bitrotted again. This patch makes a bunch of small fixes to get it back to building status. It's still not enabled by default as gcc still generates worse code with it for some reason. Signed-off-by: David Gibson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/gup.c | 7 ++++--- arch/powerpc/mm/hash_utils_64.c | 4 ++-- arch/powerpc/mm/init_64.c | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index 9fdf4d6335e..28a114db3ba 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -41,7 +41,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, page = pte_page(pte); if (!page_cache_get_speculative(page)) return 0; - if (unlikely(pte != *ptep)) { + if (unlikely(pte_val(pte) != pte_val(*ptep))) { put_page(page); return 0; } @@ -92,7 +92,7 @@ static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate, *nr -= refs; return 0; } - if (unlikely(pte != *ptep)) { + if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ while (*nr) { put_page(page); @@ -237,7 +237,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pgd_t pgd = *pgdp; VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); - pr_debug(" %016lx: normal pgd %p\n", addr, (void *)pgd); + pr_debug(" %016lx: normal pgd %p\n", addr, + (void *)pgd_val(pgd)); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 09db4efe192..5c64af17475 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -541,7 +541,7 @@ static unsigned long __init htab_get_table_size(void) void create_section_mapping(unsigned long start, unsigned long end) { BUG_ON(htab_bolt_mapping(start, end, __pa(start), - PAGE_KERNEL, mmu_linear_psize, + pgprot_val(PAGE_KERNEL), mmu_linear_psize, mmu_kernel_ssize)); } @@ -649,7 +649,7 @@ void __init htab_initialize(void) mtspr(SPRN_SDR1, _SDR1); } - prot = PAGE_KERNEL; + prot = pgprot_val(PAGE_KERNEL); #ifdef CONFIG_DEBUG_PAGEALLOC linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 036fe2f10c7..3e6a6543f53 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -228,8 +228,8 @@ int __meminit vmemmap_populate(struct page *start_page, start, p, __pa(p)); mapped = htab_bolt_mapping(start, start + page_size, __pa(p), - PAGE_KERNEL, mmu_vmemmap_psize, - mmu_kernel_ssize); + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); BUG_ON(mapped < 0); } -- cgit v1.2.3