From 5eb9bac0406f2beb84b21aac6feb89d33d9f3f5c Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jul 2009 20:53:52 +0000 Subject: powerpc: Rearrange SLB preload code With the new top down layout it is likely that the pc and stack will be in the same segment, because the pc is most likely in a library allocated via a top down mmap. Right now we bail out early if these segments match. Rearrange the SLB preload code to sanity check all SLB preload addresses are not in the kernel, then check all addresses for conflicts. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/slb.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5b7038f248b..227056c21ee 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -218,23 +218,18 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) else unmapped_base = TASK_UNMAPPED_BASE_USER64; - if (is_kernel_addr(pc)) - return; - slb_allocate(pc); - - if (esids_match(pc,stack)) + if (is_kernel_addr(pc) || is_kernel_addr(stack) || + is_kernel_addr(unmapped_base)) return; - if (is_kernel_addr(stack)) - return; - slb_allocate(stack); + slb_allocate(pc); - if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base)) - return; + if (!esids_match(pc, stack)) + slb_allocate(stack); - if (is_kernel_addr(unmapped_base)) - return; - slb_allocate(unmapped_base); + if (!esids_match(pc, unmapped_base) && + !esids_match(stack, unmapped_base)) + slb_allocate(unmapped_base); } static inline void patch_slb_encoding(unsigned int *insn_addr, -- cgit v1.2.3 From de4376c2846bb5a8fc6fe8dbd0e4ff30905493e6 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jul 2009 20:53:53 +0000 Subject: powerpc: Preload application text segment instead of TASK_UNMAPPED_BASE TASK_UNMAPPED_BASE is not used with the new top down mmap layout. We can reuse this preload slot by loading in the segment at 0x10000000, where almost all PowerPC binaries are linked at. On a microbenchmark that bounces a token between two 64bit processes over pipes and calls gettimeofday each iteration (to access the VDSO), both the 32bit and 64bit context switch rate improves (tested on a 4GHz POWER6): 32bit: 273k/sec -> 283k/sec 64bit: 277k/sec -> 284k/sec Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/slb.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 227056c21ee..6bc8b4aeba5 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -184,7 +184,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; + unsigned long exec_base; if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { @@ -212,14 +212,13 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) /* * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; + exec_base = 0x10000000; if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(unmapped_base)) + is_kernel_addr(exec_base)) return; slb_allocate(pc); @@ -227,9 +226,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) if (!esids_match(pc, stack)) slb_allocate(stack); - if (!esids_match(pc, unmapped_base) && - !esids_match(stack, unmapped_base)) - slb_allocate(unmapped_base); + if (!esids_match(pc, exec_base) && + !esids_match(stack, exec_base)) + slb_allocate(exec_base); } static inline void patch_slb_encoding(unsigned int *insn_addr, -- cgit v1.2.3 From ee43eb788b3a06425fffb912677e2e1c8b00dd3b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:54 +0000 Subject: powerpc: Use names rather than numbers for SPRGs (v2) The kernel uses SPRG registers for various purposes, typically in low level assembly code as scratch registers or to hold per-cpu global infos such as the PACA or the current thread_info pointer. We want to be able to easily shuffle the usage of those registers as some implementations have specific constraints realted to some of them, for example, some have userspace readable aliases, etc.. and the current choice isn't always the best. This patch should not change any code generation, and replaces the usage of SPRN_SPRGn everywhere in the kernel with a named replacement and adds documentation next to the definition of the names as to what those are used for on each processor family. The only parts that still use the original numbers are bits of KVM or suspend/resume code that just blindly needs to save/restore all the SPRGs. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_low_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 14af8cedab7..b13d58932bf 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -40,7 +40,7 @@ mmu_hash_lock: * The address is in r4, and r3 contains an access flag: * _PAGE_RW (0x400) if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG3 contains the physical address of the current task's thread. + * SPRG_THREAD contains the physical address of the current task's thread. * * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE @@ -68,7 +68,7 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ -- cgit v1.2.3 From fcce810986b3f32a8322faf240f8cc5560a4c463 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:10 +0000 Subject: powerpc/mm: Add HW threads support to no_hash TLB management The current "no hash" MMU context management code is written with the assumption that one CPU == one TLB. This is not the case on implementations that support HW multithreading, where several linux CPUs can share the same TLB. This adds some basic support for this to our context management and our TLB flushing code. It also cleans up the optional debugging output a bit Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_context_nohash.c | 93 ++++++++++++++++++++++++------------ arch/powerpc/mm/tlb_nohash.c | 10 +++- 2 files changed, 70 insertions(+), 33 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index b1a727def15..834436d6d6b 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -25,10 +25,20 @@ * also clear mm->cpu_vm_mask bits when processes are migrated */ -#undef DEBUG -#define DEBUG_STEAL_ONLY -#undef DEBUG_MAP_CONSISTENCY -/*#define DEBUG_CLAMP_LAST_CONTEXT 15 */ +#define DEBUG_MAP_CONSISTENCY +#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif #include #include @@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock); static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; - unsigned int cpu, max; + unsigned int cpu, max, i; max = last_context - first_context; @@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id) id = first_context; continue; } - pr_devel("[%d] steal context %d from mm @%p\n", - smp_processor_id(), id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; - /* Mark it stale on all CPUs that used this mm */ - for_each_cpu(cpu, mm_cpumask(mm)) - __set_bit(id, stale_map[cpu]); + /* Mark it stale on all CPUs that used this mm. For threaded + * implementations, we set it on all threads on each core + * represented in the mask. A future implementation will use + * a core map instead but this will do for now. + */ + for_each_cpu(cpu, mm_cpumask(mm)) { + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) + __set_bit(id, stale_map[i]); + cpu = i - 1; + } return id; } @@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id) /* Pick up the victim mm */ mm = context_mm[id]; - pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Flush the TLB for that context */ local_flush_tlb_mm(mm); @@ -179,19 +196,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* No lockless fast path .. yet */ spin_lock(&context_lock); -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n", - cpu, next, next->context.active, next->context.id); -#endif + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); #ifdef CONFIG_SMP /* Mark us active and the previous one not anymore */ next->context.active++; if (prev) { -#ifndef DEBUG_STEAL_ONLY - pr_devel(" old context %p active was: %d\n", - prev, prev->context.active); -#endif + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); WARN_ON(prev->context.active < 1); prev->context.active--; } @@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* If we already have a valid assigned context, skip all that */ id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif goto ctxt_ok; + } /* We really don't have a context, let's try to acquire one */ id = next_context; @@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) next_context = id + 1; context_mm[id] = next; next->context.id = id; - -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] picked up new id %d, nrf is now %d\n", - cpu, id, nr_free_contexts); -#endif + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); context_check_map(); ctxt_ok: @@ -248,15 +262,20 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) * local TLB for it and unmark it before we use it */ if (test_bit(id, stale_map[cpu])) { - pr_devel("[%d] flushing stale context %d for mm @%p !\n", - cpu, id, next); + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_in_core(cpu), + cpu_last_thread_in_core(cpu)); + local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - __clear_bit(id, stale_map[cpu]); + for (cpu = cpu_first_thread_in_core(cpu); + cpu <= cpu_last_thread_in_core(cpu); cpu++) + __clear_bit(id, stale_map[cpu]); } /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); set_context(id, next->pgd); spin_unlock(&context_lock); } @@ -266,6 +285,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { + pr_hard("initing context for mm @%p\n", mm); + mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; @@ -305,7 +326,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; - +#ifdef CONFIG_HOTPLUG_CPU + struct task_struct *p; +#endif /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ @@ -324,8 +347,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); kfree(stale_map[cpu]); stale_map[cpu] = NULL; - break; -#endif + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + read_lock(&tasklist_lock); + for_each_process(p) { + if (p->mm) + cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm)); + } + read_unlock(&tasklist_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ } return NOTIFY_OK; } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ad2eb4d34dd..d908e75cc3b 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -87,6 +87,12 @@ EXPORT_SYMBOL(local_flush_tlb_page); static DEFINE_SPINLOCK(tlbivax_lock); +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_thread_cpumask(smp_processor_id())); +} + struct tlb_flush_param { unsigned long addr; unsigned int pid; @@ -131,7 +137,7 @@ void flush_tlb_mm(struct mm_struct *mm) pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; - if (!cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + if (!mm_is_core_local(mm)) { struct tlb_flush_param p = { .pid = pid }; /* Ignores smp_processor_id() even if set. */ smp_call_function_many(mm_cpumask(mm), @@ -153,7 +159,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; cpu_mask = mm_cpumask(vma->vm_mm); - if (!cpumask_equal(cpu_mask, cpumask_of(smp_processor_id()))) { + if (!mm_is_core_local(mm)) { /* If broadcast tlbivax is supported, use it */ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); -- cgit v1.2.3 From a245067e204f69c69abf92d94fc45ec65bf1f07e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:16 +0000 Subject: powerpc/mm: Add support for early ioremap on non-hash 64-bit processors This adds some code to do early ioremap's using page tables instead of bolting entries in the hash table. This will be used by the upcoming 64-bits BookE port. The patch also changes the test for early vs. late ioremap to use slab_is_available() instead of our old hackish mem_init_done. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable_64.c | 59 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index bfa7db6b2fd..93ed1a3c872 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -55,19 +57,36 @@ unsigned long ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PPC_MMU_NOHASH +static void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + if (init_bootmem_done) + pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS)); + else + pt = __va(lmb_alloc_base(size, size, + __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} +#endif /* CONFIG_PPC_MMU_NOHASH */ + /* - * map_io_page currently only called by __ioremap - * map_io_page adds an entry to the ioremap page table + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_io_page(unsigned long ea, unsigned long pa, int flags) +static int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - if (mem_init_done) { + if (slab_is_available()) { pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -81,6 +100,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); } else { +#ifdef CONFIG_PPC_MMU_NOHASH + /* Warning ! This will blow up if bootmem is not initialized + * which our ppc64 code is keen to do that, we'll need to + * fix it and/or be more careful + */ + pgdp = pgd_offset_k(ea); +#ifdef PUD_TABLE_SIZE + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* PUD_TABLE_SIZE */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); +#else /* CONFIG_PPC_MMU_NOHASH */ /* * If the mm subsystem is not fully up, we cannot create a * linux page table entry for this mapping. Simply bolt an @@ -93,6 +141,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) "memory at %016lx !\n", pa); return -ENOMEM; } +#endif /* !CONFIG_PPC_MMU_NOHASH */ } return 0; } @@ -124,7 +173,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_io_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) return NULL; return (void __iomem *)ea; -- cgit v1.2.3 From d4e167da4cb60910f6ac305aee03714937f70b71 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:24 +0000 Subject: powerpc/mm: Make low level TLB flush ops on BookE take additional args We need to pass down whether the page is direct or indirect and we'll need to pass the page size to _tlbil_va and _tlbivax_bcast We also add a new low level _tlbil_pid_noind() which does a TLB flush by PID but avoids flushing indirect entries if possible This implements those new prototypes but defines them with inlines or macros so that no additional arguments are actually passed on current processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_decl.h | 16 ++++++++++++--- arch/powerpc/mm/tlb_nohash.c | 42 +++++++++++++++++++++++++++++----------- arch/powerpc/mm/tlb_nohash_low.S | 6 +++--- 3 files changed, 47 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d1f9c62dc17..3871dceee2d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -36,21 +36,30 @@ static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); } +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) + #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ #ifdef CONFIG_8xx -static inline void _tlbil_va(unsigned long address, unsigned int pid) +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } #else /* CONFIG_8xx */ -extern void _tlbil_va(unsigned long address, unsigned int pid); +extern void __tlbil_va(unsigned long address, unsigned int pid); +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + __tlbil_va(address, pid); +} #endif /* CONIFG_8xx */ /* @@ -58,7 +67,8 @@ extern void _tlbil_va(unsigned long address, unsigned int pid); * implementation. When that becomes the case, this will be * an extern. */ -static inline void _tlbivax_bcast(unsigned long address, unsigned int pid) +static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { BUG(); } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d908e75cc3b..761e8882416 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -67,18 +67,24 @@ void local_flush_tlb_mm(struct mm_struct *mm) } EXPORT_SYMBOL(local_flush_tlb_mm); -void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) { unsigned int pid; preempt_disable(); - pid = vma ? vma->vm_mm->context.id : 0; + pid = mm ? mm->context.id : 0; if (pid != MMU_NO_CONTEXT) - _tlbil_va(vmaddr, pid); + _tlbil_va(vmaddr, pid, tsize, ind); preempt_enable(); } -EXPORT_SYMBOL(local_flush_tlb_page); +void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + 0 /* tsize unused for now */, 0); +} +EXPORT_SYMBOL(local_flush_tlb_page); /* * And here are the SMP non-local implementations @@ -96,6 +102,8 @@ static int mm_is_core_local(struct mm_struct *mm) struct tlb_flush_param { unsigned long addr; unsigned int pid; + unsigned int tsize; + unsigned int ind; }; static void do_flush_tlb_mm_ipi(void *param) @@ -109,7 +117,7 @@ static void do_flush_tlb_page_ipi(void *param) { struct tlb_flush_param *p = param; - _tlbil_va(p->addr, p->pid); + _tlbil_va(p->addr, p->pid, p->tsize, p->ind); } @@ -149,37 +157,49 @@ void flush_tlb_mm(struct mm_struct *mm) } EXPORT_SYMBOL(flush_tlb_mm); -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) { struct cpumask *cpu_mask; unsigned int pid; preempt_disable(); - pid = vma ? vma->vm_mm->context.id : 0; + pid = mm ? mm->context.id : 0; if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; - cpu_mask = mm_cpumask(vma->vm_mm); + cpu_mask = mm_cpumask(mm); if (!mm_is_core_local(mm)) { /* If broadcast tlbivax is supported, use it */ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); if (lock) spin_lock(&tlbivax_lock); - _tlbivax_bcast(vmaddr, pid); + _tlbivax_bcast(vmaddr, pid, tsize, ind); if (lock) spin_unlock(&tlbivax_lock); goto bail; } else { - struct tlb_flush_param p = { .pid = pid, .addr = vmaddr }; + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; /* Ignores smp_processor_id() even if set in cpu_mask */ smp_call_function_many(cpu_mask, do_flush_tlb_page_ipi, &p, 1); } } - _tlbil_va(vmaddr, pid); + _tlbil_va(vmaddr, pid, tsize, ind); bail: preempt_enable(); } + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + 0 /* tsize unused for now */, 0); +} EXPORT_SYMBOL(flush_tlb_page); #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 3037911279b..c7d89a0adba 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -39,7 +39,7 @@ /* * 40x implementation needs only tlbil_va */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) /* We run the search with interrupts disabled because we have to change * the PID and I don't want to preempt when that happens. */ @@ -71,7 +71,7 @@ _GLOBAL(_tlbil_va) * 440 implementation uses tlbsx/we for tlbil_va and a full sweep * of the TLB for everything else. */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) mfspr r5,SPRN_MMUCR rlwimi r5,r4,0,24,31 /* Set TID */ @@ -170,7 +170,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) * Flush MMU TLB for a particular address, but only on the local processor * (no broadcast) */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) mfmsr r10 wrteei 0 slwi r4,r4,16 -- cgit v1.2.3 From c7cc58a1ad8dfe3c199d3b6ce50412b86dd3edaf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:28 +0000 Subject: powerpc/mm: Rework & cleanup page table freeing code path That patch used to just add a hook to page table flushing but pulling that string brought out a whole bunch of issues, so it now does that and more: - We now make the RCU batching of page freeing SMP only, as I believe it was intended initially. We make a few more things compile to nothing on !CONFIG_SMP - Some macros are turned into functions, though that forced me to out of line a few stuffs due to unsolvable include depenencies, however it's probably better that way anyway, it's not -that- critical code path. - 32-bit didn't call pte_free_finish() on tlb_flush() which means that it wouldn't push out the batch to RCU for delayed freeing when a bunch of page tables have been freed, they would just stay in there until the batch gets full. 64-bit BookE will use that hook to maintain the virtually linear page tables or the indirect entries in the TLB when using the HW loader. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable.c | 10 ++++++++++ arch/powerpc/mm/tlb_hash32.c | 3 +++ arch/powerpc/mm/tlb_hash64.c | 15 +++++++++++++++ arch/powerpc/mm/tlb_nohash.c | 8 ++++++++ 4 files changed, 36 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 627767d6169..a65979a5f75 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,14 @@ #include #include +#ifdef CONFIG_SMP + +/* + * Handle batching of page table freeing on SMP. Page tables are + * queued up and send to be freed later by RCU in order to avoid + * freeing a page table page that is being walked without locks + */ + static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); static unsigned long pte_freelist_forced_free; @@ -116,6 +124,8 @@ void pte_free_finish(void) *batchp = NULL; } +#endif /* CONFIG_SMP */ + /* * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() */ diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 65190587a36..8aaa8b7eb32 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -71,6 +71,9 @@ void tlb_flush(struct mmu_gather *tlb) */ _tlbia(); } + + /* Push out batch of freed page tables */ + pte_free_finish(); } /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 937eb90677d..8e35a606693 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -154,6 +154,21 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } +void tlb_flush(struct mmu_gather *tlb) +{ + struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); + + /* If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + /* Push out batch of freed page tables */ + pte_free_finish(); +} + /** * __flush_hash_table_range - Flush all HPTEs for a given address range * from the hash table (and the TLB). But keeps diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 761e8882416..6b43fc49f10 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -233,3 +233,11 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_mm(vma->vm_mm); } EXPORT_SYMBOL(flush_tlb_range); + +void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); + + /* Push out batch of freed page tables */ + pte_free_finish(); +} -- cgit v1.2.3 From 57e2a99f74b0d3720c97a6aadb57ae6aad3c61ea Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Jul 2009 11:59:34 +1000 Subject: powerpc: Add memory management headers for new 64-bit BookE This adds the PTE and pgtable format definitions, along with changes to the kernel memory map and other definitions related to implementing support for 64-bit Book3E. This also shields some asm-offset bits that are currently only relevant on 32-bit We also move the definition of the "linux" page size constants to the common mmu.h file and add a few sizes that are relevant to embedded processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugetlbpage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c46ef2ffa3d..90df6ffe3a4 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ #define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { - "unused_4K", "hugepte_cache_64K", "unused_64K_AP", - "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" + [MMU_PAGE_64K] = "hugepte_cache_64K", + [MMU_PAGE_1M] = "hugepte_cache_1M", + [MMU_PAGE_16M] = "hugepte_cache_16M", + [MMU_PAGE_16G] = "hugepte_cache_16G", }; /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() @@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize) if (mmu_huge_psizes[psize] || mmu_psize_defs[psize].shift == PAGE_SHIFT) return; + if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) + return; hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); switch (mmu_psize_defs[psize].shift) { -- cgit v1.2.3 From a8f7758c1c52a13e031266483efd5525157e43e9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:45 +0000 Subject: powerpc/mm: Move around mmu_gathers definition on 64-bit The definition for the global structure mmu_gathers, used by generic code, is currently defined in multiple places not including anything used by 64-bit Book3E. This changes it by moving to one place common to all processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/init_32.c | 2 -- arch/powerpc/mm/pgtable.c | 2 ++ arch/powerpc/mm/tlb_hash64.c | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3de6a0d9382..3ef5084b90c 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -54,8 +54,6 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - phys_addr_t total_memory; phys_addr_t total_lowmem; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a65979a5f75..cafb2a26954 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,8 @@ #include #include +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + #ifdef CONFIG_SMP /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 8e35a606693..2b2f35f6985 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -33,11 +33,6 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); -/* This is declared as we are using the more or less generic - * arch/powerpc/include/asm/tlb.h file -- tgall - */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* * A linux PTE was changed and the corresponding hash table entry * neesd to be flushed. This function will either perform the flush -- cgit v1.2.3 From 25d21ad6e799cccd097b9df2a2fefe19a7e1dfcf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:47 +0000 Subject: powerpc: Add TLB management code for 64-bit Book3E This adds the TLB miss handler assembly, the low level TLB flush routines along with the necessary hook for dealing with our virtual page tables or indirect TLB entries that need to be flushes when PTE pages are freed. There is currently no support for hugetlbfs Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_decl.h | 14 +- arch/powerpc/mm/tlb_low_64e.S | 734 +++++++++++++++++++++++++++++++++++++++ arch/powerpc/mm/tlb_nohash.c | 203 ++++++++++- arch/powerpc/mm/tlb_nohash_low.S | 79 +++++ 4 files changed, 1025 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/mm/tlb_low_64e.S (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 3871dceee2d..5961c6b739d 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -41,7 +41,11 @@ static inline void _tlbil_pid(unsigned int pid) #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbil_pid_noind(unsigned int pid); +#else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) +#endif #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* @@ -53,7 +57,10 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } -#else /* CONFIG_8xx */ +#elif defined(CONFIG_PPC_BOOK3E) +extern void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else extern void __tlbil_va(unsigned long address, unsigned int pid); static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) @@ -67,11 +74,16 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, * implementation. When that becomes the case, this will be * an extern. */ +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { BUG(); } +#endif #else /* CONFIG_PPC_MMU_NOHASH */ diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S new file mode 100644 index 00000000000..10d524ded7b --- /dev/null +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -0,0 +1,734 @@ +/* + * Low leve TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PPC_64K_PAGES +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) +#else +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#endif +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. + * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + + /* Set the TLB reservation and seach for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,r16) + ld r14,0(r10) + beq normal_tlb_miss_done + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: mtspr SPRN_MAS7_MAS3,r15 + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_HWEXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,r16) + beq virt_page_table_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + mtspr SPRN_MAS7_MAS3,r10 + + tlbwe + +virt_page_table_tlb_miss_done: + + /* We have overriden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retreive + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant informations in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,r16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + mtspr SPRN_MAS7_MAS3,r10 + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + ld r11,PACATOC(r13) + ld r11,linear_map_top@got(r11) + ld r10,0(r11) + cmpld cr0,r10,r16 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB< - * IBM Corp. + * Copyright 2008,2009 Ben Herrenschmidt + * IBM Corp. * * Derived from arch/ppc/mm/init.c: * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -34,12 +34,70 @@ #include #include #include +#include #include #include +#include #include "mmu_decl.h" +#ifdef CONFIG_PPC_BOOK3E +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_16K] = { + .shift = 14, + .enc = BOOK3E_PAGESZ_16K, + }, + [MMU_PAGE_64K] = { + .shift = 16, + .enc = BOOK3E_PAGESZ_64K, + }, + [MMU_PAGE_1M] = { + .shift = 20, + .enc = BOOK3E_PAGESZ_1M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +#ifdef CONFIG_PPC64 + +int mmu_linear_psize; /* Page size used for the linear mapping */ +int mmu_pte_psize; /* Page size used for PTE pages */ +int book3e_htw_enabled; /* Is HW tablewalk enabled ? */ +unsigned long linear_map_top; /* Top of linear mapping */ + +#endif /* CONFIG_PPC64 */ + /* * Base TLB flushing operations: * @@ -82,7 +140,7 @@ void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - 0 /* tsize unused for now */, 0); + mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(local_flush_tlb_page); @@ -198,7 +256,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - 0 /* tsize unused for now */, 0); + mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(flush_tlb_page); @@ -241,3 +299,140 @@ void tlb_flush(struct mmu_gather *tlb) /* Push out batch of freed page tables */ pte_free_finish(); } + +/* + * Below are functions specific to the 64-bit variant of Book3E though that + * may change in the future + */ + +#ifdef CONFIG_PPC64 + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].enc; + + if (book3e_htw_enabled) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + +#ifdef CONFIG_PPC_64K_PAGES + vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; +#else + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; +#endif + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +/* + * Early initialization of the MMU TLB code + */ +static void __early_init_mmu(int boot_cpu) +{ + extern unsigned int interrupt_base_book3e; + extern unsigned int exc_data_tlb_miss_htw_book3e; + extern unsigned int exc_instruction_tlb_miss_htw_book3e; + + unsigned int *ibase = &interrupt_base_book3e; + unsigned int mas4; + + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it + */ + mmu_linear_psize = MMU_PAGE_1G; + + + /* Check if HW tablewalk is present, and if yes, enable it by: + * + * - patching the TLB miss handlers to branch to the + * one dedicates to it + * + * - setting the global book3e_htw_enabled + * + * - Set MAS4:INDD and default page size + */ + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + if (boot_cpu) { + unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG); + + /* Check if HW loader is supported */ + if ((tlb0cfg & TLBnCFG_IND) && + (tlb0cfg & TLBnCFG_PT)) { + patch_branch(ibase + (0x1c0 / 4), + (unsigned long)&exc_data_tlb_miss_htw_book3e, 0); + patch_branch(ibase + (0x1e0 / 4), + (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0); + book3e_htw_enabled = 1; + } + pr_info("MMU: Book3E Page Tables %s\n", + book3e_htw_enabled ? "Enabled" : "Disabled"); + } + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + if (book3e_htw_enabled) { + mas4 |= mas4 | MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + } else { +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + } + mtspr(SPRN_MAS4, mas4); + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = lmb_end_of_DRAM(); + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +void __init early_init_mmu(void) +{ + __early_init_mmu(1); +} + +void __cpuinit early_init_mmu_secondary(void) +{ + __early_init_mmu(0); +} + +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index c7d89a0adba..7bcd9fbf6cc 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -191,6 +191,85 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,r3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,r3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr #else #error Unsupported processor type ! #endif -- cgit v1.2.3 From 32a74949b7337726e76d69f51c48715431126c6c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:58 +0000 Subject: powerpc/mm: Add support for SPARSEMEM_VMEMMAP on 64-bit Book3E The base TLB support didn't include support for SPARSEMEM_VMEMMAP, though we did carve out some virtual space for it, the necessary support code wasn't there. This implements it by using 16M pages for now, though the page size could easily be changed at runtime if necessary. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/init_64.c | 55 ++++++++++++++++++++++++++++++++++++++------ arch/powerpc/mm/mmu_decl.h | 7 +++++- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/tlb_nohash.c | 11 ++++++++- 4 files changed, 65 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 68a821add28..31582329cd6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } +/* On hash-based CPUs, the vmemmap is bolted in the hash table. + * + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ + +#ifdef CONFIG_PPC_BOOK3E +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); +} +#else /* CONFIG_PPC_BOOK3E */ +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int mapped = htab_bolt_mapping(start, start + page_size, phys, + PAGE_KERNEL, mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif /* CONFIG_PPC_BOOK3E */ + int __meminit vmemmap_populate(struct page *start_page, unsigned long nr_pages, int node) { @@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page, /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); + pr_debug("vmemmap_populate page %p, %ld pages, node %d\n", + start_page, nr_pages, node); + pr_debug(" -> map %lx..%lx\n", start, end); + for (; start < end; start += page_size) { - int mapped; void *p; if (vmemmap_populated(start, page_size)) @@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page, if (!p) return -ENOMEM; - pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", - start, p, __pa(p)); + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); - mapped = htab_bolt_mapping(start, start + page_size, __pa(p), - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - BUG_ON(mapped < 0); + vmemmap_create_mapping(start, page_size, __pa(p)); } return 0; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 5961c6b739d..d2e5321d5ea 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -121,7 +121,12 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; -#endif + +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); +#endif /* CONFIG_PPC64 */ extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 93ed1a3c872..853d5565eed 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -79,7 +79,7 @@ static void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_kernel_page(unsigned long ea, unsigned long pa, int flags) +int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d16100c9416..2fbc680c2c7 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -93,6 +93,7 @@ static inline int mmu_get_tsize(int psize) int mmu_linear_psize; /* Page size used for the linear mapping */ int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_enabled; /* Is HW tablewalk enabled ? */ unsigned long linear_map_top; /* Top of linear mapping */ @@ -356,10 +357,18 @@ static void __early_init_mmu(int boot_cpu) unsigned int mas4; /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) */ mmu_linear_psize = MMU_PAGE_1G; + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + */ + mmu_vmemmap_psize = MMU_PAGE_16M; /* Check if HW tablewalk is present, and if yes, enable it by: * -- cgit v1.2.3 From 2d27cfd3286966c04d4192a9db5a6c7ea60eebf1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:59 +0000 Subject: powerpc: Remaining 64-bit Book3E support This contains all the bits that didn't fit in previous patches :-) This includes the actual exception handlers assembly, the changes to the kernel entry, other misc bits and wiring it all up in Kconfig. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3e68363405b..6fb8fc8d2fe 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,6 +13,7 @@ obj-y := fault.o mem.o pgtable.o gup.o \ pgtable_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o obj-$(CONFIG_PPC64) += mmap_64.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ -- cgit v1.2.3 From 67050b5c3e9992d98554bd224d5a7898cc4881ff Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 4 Aug 2009 22:33:32 -0500 Subject: powerpc/mm: Fix switch_mmu_context to iterate of the proper list of cpus Introduced a temporary variable into our iterating over the list cpus that are threads on the same core. For some reason Ben forgot how for loops work. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_context_nohash.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 834436d6d6b..c2f93dc470e 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -190,7 +190,7 @@ static void context_check_map(void) { } void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - unsigned int id, cpu = smp_processor_id(); + unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; /* No lockless fast path .. yet */ @@ -269,9 +269,10 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - for (cpu = cpu_first_thread_in_core(cpu); - cpu <= cpu_last_thread_in_core(cpu); cpu++) - __clear_bit(id, stale_map[cpu]); + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) { + __clear_bit(id, stale_map[i]); + } } /* Flick the MMU and release lock */ -- cgit v1.2.3 From 8dcd038a13b8e322c49fe0d3e31a0deaba4fd5fd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 16:00:37 -0700 Subject: powerpc/fsl-booke: read buffer overflow cam[tlbcam_index] is checked before tlbcam_index < ARRAY_SIZE(cam) Signed-off-by: Roel Kluin Signed-off-by: Andrew Morton Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index bb3d65998e6..dc93e95b256 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(void) unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; - while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) { + while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); virt += cam[tlbcam_index]; phys += cam[tlbcam_index]; -- cgit v1.2.3 From 797a747a82e23530ee45d2927bf84f3571c1acb2 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 15:21:40 +0000 Subject: powerpc/mm: Fix assert_pte_locked to work properly on uniprocessor Since the pte_lockptr is a spinlock it gets optimized away on uniprocessor builds so using spin_is_locked is not correct. We can use assert_spin_locked instead and get the proper behavior between UP and SMP builds. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index cafb2a26954..b6b32487e74 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -254,7 +254,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); BUG_ON(!pmd_present(*pmd)); - BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd))); + assert_spin_locked(pte_lockptr(mm, pmd)); } #endif /* CONFIG_DEBUG_VM */ -- cgit v1.2.3 From fc4bdb35fba1c8f464fd85b94a5059e752fc85d4 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 14 Aug 2009 09:38:34 -0500 Subject: powerpc/booke: Move MMUCSR definition into mmu-book3e.h The MMUCSR is now defined as part of the Book-3E architecture so we can move it into mmu-book3e.h and add some of the additional bits defined by the architecture specs. Signed-off-by: Kumar Gala --- arch/powerpc/mm/tlb_nohash_low.S | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 7bcd9fbf6cc..bbdc5b577b8 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -124,8 +124,6 @@ _GLOBAL(_tlbil_pid) * to have the larger code path before the _SECTION_ELSE */ -#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ - MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) /* * Flush MMU TLB on the local processor */ -- cgit v1.2.3 From ea3cc330ac0cd521ff07c7cd432a1848c19a7e92 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 18 Aug 2009 19:00:34 +0000 Subject: powerpc/mm: Cleanup handling of execute permission This is an attempt at cleaning up a bit the way we handle execute permission on powerpc. _PAGE_HWEXEC is gone, _PAGE_EXEC is now only defined by CPUs that can do something with it, and the myriad of #ifdef's in the I$/D$ coherency code is reduced to 2 cases that hopefully should cover everything. The logic on BookE is a little bit different than what it was though not by much. Since now, _PAGE_EXEC will be set by the generic code for executable pages, we need to filter out if they are unclean and recover it. However, I don't expect the code to be more bloated than it already was in that area due to that change. I could boast that this brings proper enforcing of per-page execute permissions to all BookE and 40x but in fact, we've had that now for some time as a side effect of my previous rework in that area (and I didn't even know it :-) We would only enable execute permission if the page was cache clean and we would only cache clean it if we took and exec fault. Since we now enforce that the later only work if VM_EXEC is part of the VMA flags, we de-fact already enforce per-page execute permissions... Unless I missed something Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/40x_mmu.c | 4 +- arch/powerpc/mm/pgtable.c | 167 +++++++++++++++++++++++++++++------------- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/tlb_low_64e.S | 4 +- 4 files changed, 121 insertions(+), 56 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 29954dc2894..f5e7b9ce63d 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -105,7 +105,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_16M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp++) = val; @@ -120,7 +120,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_4M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp) = val; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index b6b32487e74..83f1551ec2c 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -128,28 +128,6 @@ void pte_free_finish(void) #endif /* CONFIG_SMP */ -/* - * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() - */ -static pte_t do_dcache_icache_coherency(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - struct page *page; - - if (unlikely(!pfn_valid(pfn))) - return pte; - page = pfn_to_page(pfn); - - if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) { - pr_devel("do_dcache_icache_coherency... flushing\n"); - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - else - pr_devel("do_dcache_icache_coherency... already clean\n"); - return __pte(pte_val(pte) | _PAGE_HWEXEC); -} - static inline int is_exec_fault(void) { return current->thread.regs && TRAP(current->thread.regs) == 0x400; @@ -157,49 +135,139 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE + * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == - (_PAGE_PRESENT); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); } -#if defined(CONFIG_PPC_STD_MMU) +struct page * maybe_pte_to_page(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + struct page *page; + + if (unlikely(!pfn_valid(pfn))) + return NULL; + page = pfn_to_page(pfn); + if (PageReserved(page)) + return NULL; + return page; +} + +#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 + /* Server-style MMU handles coherency when hashing if HW exec permission - * is supposed per page (currently 64-bit only). Else, we always flush - * valid PTEs in set_pte. + * is supposed per page (currently 64-bit only). If not, then, we always + * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec + * support falls into the same category. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_pte_filter(pte_t pte) { - return set_pte && pte_looks_normal(pte) && - !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE)); + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct page *pg = maybe_pte_to_page(pte); + if (!pg) + return pte; + if (!test_bit(PG_arch_1, &pg->flags)) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + } + } + return pte; } -#elif _PAGE_HWEXEC == 0 -/* Embedded type MMU without HW exec support (8xx only so far), we flush - * the cache for any present PTE - */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) { - return set_pte && pte_looks_normal(pte); + return pte; } -#else -/* Other embedded CPUs with HW exec support per-page, we flush on exec - * fault if HWEXEC is not set + +#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ + +/* Embedded type MMU with HW exec support. This is a bit more complicated + * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so + * instead we "filter out" the exec permission for non clean pages. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) +static pte_t set_pte_filter(pte_t pte) { - return pte_looks_normal(pte) && is_exec_fault() && - !(pte_val(pte) & _PAGE_HWEXEC); + struct page *pg; + + /* No exec permission in the first place, move on */ + if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + return pte; + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + return pte; + + /* If the page clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + return pte; + + /* If it's an exec fault, we flush the cache and make it clean */ + if (is_exec_fault()) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + return pte; + } + + /* Else, we filter out _PAGE_EXEC */ + return __pte(pte_val(pte) & ~_PAGE_EXEC); } -#endif + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + struct page *pg; + + /* So here, we only care about exec faults, as we use them + * to recover lost _PAGE_EXEC and perform I$/D$ coherency + * if necessary. Also if _PAGE_EXEC is already set, same deal, + * we just bail out + */ + if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + return pte; + +#ifdef CONFIG_DEBUG_VM + /* So this is an exec fault, _PAGE_EXEC is not set. If it was + * an error we would have bailed out earlier in do_page_fault() + * but let's make sure of it + */ + if (WARN_ON(!(vma->vm_flags & VM_EXEC))) + return pte; +#endif /* CONFIG_DEBUG_VM */ + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + goto bail; + + /* If the page is already clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + goto bail; + + /* Clean the page and set PG_arch_1 */ + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + + bail: + return __pte(pte_val(pte) | _PAGE_EXEC); +} + +#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte) { #ifdef CONFIG_DEBUG_VM WARN_ON(pte_present(*ptep)); @@ -208,9 +276,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte * this context might not have been activated yet when this * is called. */ - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_need_exec_flush(pte, 1)) - pte = do_dcache_icache_coherency(pte); + pte = set_pte_filter(pte); /* Perform the setting of the PTE */ __set_pte_at(mm, addr, ptep, pte, 0); @@ -227,8 +293,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed; - if (!dirty && pte_need_exec_flush(entry, 0)) - entry = do_dcache_icache_coherency(entry); + entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { if (!(vma->vm_flags & VM_HUGETLB)) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5422169626b..cb96cb2e17c 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -142,7 +142,7 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags) flags |= _PAGE_DIRTY | _PAGE_HWWRITE; /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC); + flags &= ~(_PAGE_USER | _PAGE_EXEC); return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 10d524ded7b..cd92f62f9cf 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -133,7 +133,7 @@ /* We do the user/kernel test for the PID here along with the RW test */ - li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h cmpldi cr0,r15,0 /* Check for user region */ @@ -256,7 +256,7 @@ normal_tlb_miss_done: normal_tlb_miss_access_fault: /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_HWEXEC + andi. r10,r11,_PAGE_EXEC bne 1f ld r14,EX_TLB_DEAR(r12) ld r15,EX_TLB_ESR(r12) -- cgit v1.2.3 From df5d6ecf8157245ef733db87597adb2c6e2510da Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 24 Aug 2009 15:52:48 +0000 Subject: powerpc/mm: Add MMU features for TLB reservation & Paired MAS registers Support for TLB reservation (or TLB Write Conditional) and Paired MAS registers are optional for a processor implementation so we handle them via MMU feature sections. We currently only used paired MAS registers to access the full RPN + perm bits that are kept in MAS7||MAS3. We assume that if an implementation has hardware page table at this time it also implements in TLB reservations. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/tlb_low_64e.S | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index cd92f62f9cf..ef1cccf7117 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -189,12 +189,16 @@ normal_tlb_miss: clrrdi r14,r14,3 or r10,r15,r14 +BEGIN_MMU_FTR_SECTION /* Set the TLB reservation and seach for existing entry. Then load * the entry. */ PPC_TLBSRX_DOT(0,r16) ld r14,0(r10) beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) finish_normal_tlb_miss: /* Check if required permissions are met */ @@ -241,7 +245,14 @@ finish_normal_tlb_miss: bne 1f li r11,MAS3_SW|MAS3_UW andc r15,r15,r11 -1: mtspr SPRN_MAS7_MAS3,r15 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -311,11 +322,13 @@ virt_page_table_tlb_miss: rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 1: +BEGIN_MMU_FTR_SECTION /* Search if we already have a TLB entry for that virtual address, and * if we do, bail out. */ PPC_TLBSRX_DOT(0,r16) beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Now, we need to walk the page tables. First check if we are in * range. @@ -367,10 +380,18 @@ virt_page_table_tlb_miss: */ clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe +BEGIN_MMU_FTR_SECTION virt_page_table_tlb_miss_done: /* We have overriden MAS2:EPN but currently our primary TLB miss @@ -394,6 +415,7 @@ virt_page_table_tlb_miss_done: addi r10,r11,-4 std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) 1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); TLB_MISS_EPILOG_SUCCESS @@ -618,7 +640,14 @@ htw_tlb_miss: #else ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) #endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -700,7 +729,14 @@ tlb_load_linear: clrrdi r10,r16,30 /* 1G page index */ clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe -- cgit v1.2.3 From 46db2f86a3b2a94e0b33e0b4548fb7b7b6bdff66 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 28 Aug 2009 12:06:29 +0000 Subject: powerpc/pseries: Fix to handle slb resize across migration The SLB can change sizes across a live migration, which was not being handled, resulting in possible machine crashes during migration if migrating to a machine which has a smaller max SLB size than the source machine. Fix this by first reducing the SLB size to the minimum possible value, which is 32, prior to migration. Then during the device tree update which occurs after migration, we make the call to ensure the SLB gets updated. Also add the slb_size to the lparcfg output so that the migration tools can check to make sure the kernel has this capability before allowing migration in scenarios where the SLB size will change. BenH: Fixed #include -> to avoid breaking ppc32 build Signed-off-by: Brian King Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/slb.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 07961c5c169..1d98ecc8eec 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -249,14 +249,22 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) static inline void patch_slb_encoding(unsigned int *insn_addr, unsigned int immed) { - /* Assume the instruction had a "0" immediate value, just - * "or" in the new value - */ - *insn_addr |= immed; + *insn_addr = (*insn_addr & 0xffff0000) | immed; flush_icache_range((unsigned long)insn_addr, 4+ (unsigned long)insn_addr); } +void slb_set_size(u16 size) +{ + extern unsigned int *slb_compare_rr_to_size; + + if (mmu_slb_size == size) + return; + + mmu_slb_size = size; + patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); +} + void slb_initialize(void) { unsigned long linear_llp, vmalloc_llp, io_llp; -- cgit v1.2.3