From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75..1f524df68b9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; int write, si_code; int fault; + unsigned long *stackend; + #ifdef CONFIG_X86_64 unsigned long flags; #endif @@ -850,6 +853,10 @@ no_context: show_fault_oops(regs, error_code, address); + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3 From dacf7333571d770366bff74d10b56aa545434605 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 7 Jan 2009 17:26:35 +0530 Subject: x86: smp.h move zap_low_mappings declartion to tlbflush.h Impact: cleanup, moving NON-SMP stuff from smp.h Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f99a6c6c432..a9dd0b7ad61 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,7 +49,6 @@ #include #include #include -#include unsigned int __VMALLOC_RESERVE = 128 << 20; -- cgit v1.2.3 From 92181f190b649f7ef2b79cbf5c00f26ccc66da2a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 20 Jan 2009 04:24:26 +0100 Subject: x86: optimise x86's do_page_fault (C entry point for the page fault path) Impact: cleanup, restructure code to improve assembly gcc isn't _all_ that smart about spilling registers to stack or reusing stack slots, even with branch annotations. do_page_fault contained a lot of functionality, so split unlikely paths into their own functions, and mark them as noinline just to be sure. I consider this actually to be somewhat of a cleanup too: the main function now contains about half the number of lines so the normal path is easier to read, while the error cases are also nicely split away. Also, ensure the order of arguments to functions is always the same: regs, addr, error_code. This can reduce code size a tiny bit, and just looks neater too. And add a couple of branch annotations. Before: do_page_fault: subq $360, %rsp #, After: do_page_fault: subq $56, %rsp #, bloat-o-meter: add/remove: 8/0 grow/shrink: 0/1 up/down: 2222/-1680 (542) function old new delta __bad_area_nosemaphore - 506 +506 no_context - 474 +474 vmalloc_fault - 424 +424 spurious_fault - 358 +358 mm_fault_error - 272 +272 bad_area_access_error - 89 +89 bad_area - 89 +89 bad_area_nosemaphore - 10 +10 do_page_fault 2464 784 -1680 Yes, the total size increases by 542 bytes, due to the extra function calls. But these will very rarely be called (except for vmalloc_fault) in a normal workload. Importantly, do_page_fault is less than 1/3rd it's original size, and touches far less stack. Existing gotos and branch hints did move a lot of the infrequently used text out of the fastpath, but that's even further improved after this patch. Signed-off-by: Nick Piggin Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 438 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 256 insertions(+), 182 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 90dfae511a4..033292dc9e2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner */ -static int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) +static int is_prefetch(struct pt_regs *regs, unsigned long error_code, + unsigned long addr) { unsigned char *instr; int scan_more = 1; @@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static noinline void pgtable_bad(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { unsigned long flags = oops_begin(); int sig = SIGKILL; - struct task_struct *tsk; + struct task_struct *tsk = current; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); + tsk->comm, address); dump_pagetable(address); tsk = current; tsk->thread.cr2 = address; @@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static noinline void no_context(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; +#ifdef CONFIG_X86_64 + unsigned long flags; + int sig; +#endif + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); +#endif +} + +static void __bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void __bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void bad_area_access_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void out_of_memory(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); +} + +static void do_sigbus(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, error_code, address)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +static noinline void mm_fault_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) + out_of_memory(regs, error_code, address); + else if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); +} + static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) @@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static int spurious_fault(unsigned long address, - unsigned long error_code) +static noinline int spurious_fault(unsigned long error_code, + unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address, * * This assumes no large pages in there. */ -static int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { #ifdef CONFIG_X86_32 unsigned long pgd_paddr; @@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address) int show_unhandled_signals = 1; +static inline int access_error(unsigned long error_code, int write, + struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + } else if (unlikely(error_code & PF_PROT)) { + /* read, present */ + return 1; + } else { + /* read, not present */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + } + + return 0; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -583,16 +786,12 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address; struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long address; - int write, si_code; + int write; int fault; -#ifdef CONFIG_X86_64 - unsigned long flags; - int sig; -#endif tsk = current; mm = tsk->mm; @@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* get the address */ address = read_cr2(); - si_code = SEGV_MAPERR; - - if (notify_page_fault(regs)) + if (unlikely(notify_page_fault(regs))) return; if (unlikely(kmmio_fault(regs, address))) return; @@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) + if (spurious_fault(error_code, address)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ - goto bad_area_nosemaphore; + bad_area_nosemaphore(regs, error_code, address); + return; } - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); + pgtable_bad(regs, error_code, address); #endif /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } /* * When running in the kernel we expect faults to occur only to @@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->ip)) - goto bad_area_nosemaphore; + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } down_read(&mm->mmap_sem); } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } if (error_code & PF_USER) { /* * Accessing the stack below %sp is always a bug. @@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * and pusha to work. ("enter $65535,$31" pushes * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) - goto bad_area; + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; } /* @@ -738,11 +937,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + mm_fault_error(regs, error_code, address, fault); + return; } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; @@ -760,128 +956,6 @@ good_area: } #endif up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata100(regs, address)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - - if (is_f00f_bug(regs, address)) - return; - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * X86_64 - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else - flags = oops_begin(); -#endif - - show_fault_oops(regs, error_code, address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); -#endif - -out_of_memory: - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } DEFINE_SPINLOCK(pgd_lock); -- cgit v1.2.3 From 55f4949f5765e7a29863b6d17a774601810732f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Jan 2009 10:08:53 +0100 Subject: x86, mm: move tlb.c to arch/x86/mm/ Impact: cleanup Now that it's unified, move the (SMP) TLB flushing code from arch/x86/kernel/ to arch/x86/mm/, where it belongs logically. Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 + arch/x86/mm/tlb.c | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 arch/x86/mm/tlb.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index d8cc96a2738..9f05157220f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,6 +1,8 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o +obj-$(CONFIG_X86_SMP) += tlb.o + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 00000000000..b3ca1b94065 --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,296 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; + +#include +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right per cpu variable for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. + */ + +union smp_flush_state { + struct { + struct mm_struct *flush_mm; + unsigned long flush_va; + spinlock_t tlbstate_lock; + DECLARE_BITMAP(flush_cpumask, NR_CPUS); + }; + char pad[SMP_CACHE_BYTES]; +} ____cacheline_aligned; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static DEFINE_PER_CPU(union smp_flush_state, flush_state); + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +void leave_mm(int cpu) +{ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} +EXPORT_SYMBOL_GPL(leave_mm); + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. + */ + +/* + * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop + * but still used for documentation purpose but the usage is slightly + * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt + * entry calls in with the first parameter in %eax. Maybe define + * intrlinkage? + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void smp_invalidate_interrupt(struct pt_regs *regs) +{ + unsigned int cpu; + unsigned int sender; + union smp_flush_state *f; + + cpu = smp_processor_id(); + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. + */ + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; + f = &per_cpu(flush_state, sender); + + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); + } else + leave_mm(cpu); + } +out: + ack_APIC_irq(); + smp_mb__before_clear_bit(); + cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); + smp_mb__after_clear_bit(); + inc_irq_stat(irq_tlb_count); +} + +static void flush_tlb_others_ipi(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + unsigned int sender; + union smp_flush_state *f; + + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &per_cpu(flush_state, sender); + + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpumask_andnot(to_cpumask(f->flush_cpumask), + cpumask, cpumask_of(smp_processor_id())); + + /* + * Make the above memory operations globally visible before + * sending the IPI. + */ + smp_mb(); + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + if (is_uv_system()) { + unsigned int cpu; + + cpu = get_cpu(); + cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + if (cpumask) + flush_tlb_others_ipi(cpumask, mm, va); + put_cpu(); + return; + } + flush_tlb_others_ipi(cpumask, mm, va); +} + +static int __cpuinit init_smp_flush(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); + + return 0; +} +core_initcall(init_smp_flush); + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + + preempt_disable(); + + local_flush_tlb(); + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void *info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} -- cgit v1.2.3 From 4ec71fa2d2c3f1040348f2604f4b8ccc833d1c2e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Jan 2009 10:24:27 +0100 Subject: x86: uv cleanup, build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: arch/x86/mm/srat_64.c: In function ‘acpi_numa_processor_affinity_init’: arch/x86/mm/srat_64.c:141: error: implicit declaration of function ‘get_uv_system_type’ arch/x86/mm/srat_64.c:141: error: ‘UV_X2APIC’ undeclared (first use in this function) arch/x86/mm/srat_64.c:141: error: (Each undeclared identifier is reported only once arch/x86/mm/srat_64.c:141: error: for each function it appears in.) A couple of UV definitions were moved to asm/uv/uv.h, but srat_64.c did not include that header. Add it. Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 09737c8af07..15df1baee10 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -21,6 +21,7 @@ #include #include #include +#include int acpi_numa __initdata; -- cgit v1.2.3 From fb746d0e1365b7472ccc4c3d5b0672b34a092d0b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 21 Jan 2009 20:45:41 +0100 Subject: x86: optimise page fault entry, cleanup tsk is already assigned to current, drop the redundant second assignment. Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 033292dc9e2..8f4b859a04b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -419,7 +419,6 @@ static noinline void pgtable_bad(struct pt_regs *regs, printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk = current; tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3 From d639bab8da86d330493487e8c0fea8ca31f53427 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:13 -0800 Subject: x86 PAT: ioremap_wc should take resource_size_t parameter Impact: fix/extend ioremap_wc() beyond 4GB aperture on 32-bit ioremap_wc() was taking in unsigned long parameter, where as it should take 64-bit resource_size_t parameter like other ioremap variants. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bd85d42819e..2ddb1e79a19 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache); * * Must be freed with iounmap. */ -void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, -- cgit v1.2.3 From fe40c0af3cff3ea461cf25bddb979abc7279d4df Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 23 Jan 2009 15:49:41 -0800 Subject: x86: uaccess: introduce try and catch framework Impact: introduce new uaccess exception handling framework Introduce {get|put}_user_try and {get|put}_user_catch as new uaccess exception handling framework. {get|put}_user_try begins exception block and {get|put}_user_catch(err) ends the block and gets err if an exception occured in {get|put}_user_ex() in the block. The exception is stored thread_info->uaccess_err. The example usage of this framework is below; int func() { int err = 0; get_user_try { get_user_ex(...); get_user_ex(...); : } get_user_catch(err); return err; } Note: get_user_ex() is not clear the value when an exception occurs, it's different from the behavior of __get_user(), but I think it doesn't matter. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/extable.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 7e8db53528a..61b41ca3b5a 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(regs->ip); if (fixup) { + /* If fixup is less than 16, it means uaccess error */ + if (fixup->fixup < 16) { + current_thread_info()->uaccess_err = -EFAULT; + regs->ip += fixup->fixup; + return 1; + } regs->ip = fixup->fixup; return 1; } -- cgit v1.2.3 From 75a048119e76540d73132cfc8e0fa0c0a8bb6c83 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 22 Jan 2009 16:17:05 -0800 Subject: x86: handle PAT more like other CPU features Impact: Cleanup When PAT was originally introduced, it was handled specially for a few reasons: - PAT bugs are hard to track down, so we wanted to maintain a whitelist of CPUs. - The i386 and x86-64 CPUID code was not yet unified. Both of these are now obsolete, so handle PAT like any other features, including ordinary feature blacklisting due to known bugs. Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8b08fb95527..430cb44dd3f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -30,7 +30,7 @@ #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; -void __cpuinit pat_disable(char *reason) +void __cpuinit pat_disable(const char *reason) { pat_enabled = 0; printk(KERN_INFO "%s\n", reason); @@ -42,6 +42,11 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} #endif @@ -78,16 +83,20 @@ void pat_init(void) if (!pat_enabled) return; - /* Paranoia check. */ - if (!cpu_has_pat && boot_pat_state) { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } } /* Set PWT to Write-Combining. All other bits stay the same */ -- cgit v1.2.3 From 6470aff619fbb9dff8dfe8afa5033084cd55ca20 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: move 64-bit NUMA code Impact: Code movement, no functional change. Move the 64-bit NUMA code from setup_percpu.c to numa_64.c Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 71a14f89f89..08d140fbc31 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,6 +20,12 @@ #include #include +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...) +#endif + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -33,6 +39,21 @@ int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +DEFINE_PER_CPU(int, node_number) = 0; +EXPORT_PER_CPU_SYMBOL(node_number); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* + * Which logical CPUs are on which nodes + */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -640,3 +661,199 @@ void __init init_cpu_to_node(void) #endif +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); + + pr_debug("Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + per_cpu(node_number, cpu) = node; +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + + +/* empty cpumask */ +static const cpumask_t cpu_mask_none; + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const cpumask_t *cpumask_of_node(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return (const cpumask_t *)&cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return &cpu_mask_none; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +/* + * Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "node_to_cpumask(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_mask_none; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ -- cgit v1.2.3 From dac5f4121df3c39fdb2ea57acd669a0ae19e46f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Jan 2009 15:42:24 +0100 Subject: x86, apic: untangle the send_IPI_*() jungle Our send_IPI_*() methods and definitions are a twisted mess: the same symbol is defined to different things depending on .config details, in a non-transparent way. - spread out the quirks into separately named per apic driver methods - prefix the standard PC methods with default_ - get rid of wrapper macro obfuscation - clean up various details Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 72a6d4ebe34..6348e114692 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -196,7 +196,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(to_cpumask(f->flush_cpumask), + apic->send_IPI_mask(to_cpumask(f->flush_cpumask), INVALIDATE_TLB_VECTOR_START + sender); while (!cpumask_empty(to_cpumask(f->flush_cpumask))) -- cgit v1.2.3 From d53e2f2855f1c7c2725d550c1ae6b26f4d671c50 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Jan 2009 19:14:52 +0100 Subject: x86, smp: remove mach_ipi.h Move mach_ipi.h definitions into genapic.h. Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6348e114692..14c5af4d11e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include +#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From 3e5095d15276efd14a45393666b1bb7536bf179f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jan 2009 17:07:08 +0100 Subject: x86: replace CONFIG_X86_SMP with CONFIG_SMP The x86/Voyager subarch used to have this distinction between 'x86 SMP support' and 'Voyager SMP support': config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) This is a pointless distinction - Voyager can (and already does) use smp_ops to implement various SMP quirks it has - and it can be extended more to cover all the specialities of Voyager. So remove this complication in the Kconfig space. Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9f05157220f..2b938a38491 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,7 +1,7 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o -obj-$(CONFIG_X86_SMP) += tlb.o +obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o -- cgit v1.2.3 From 010060741ad35eacb504414bc6fb9bb575b15f62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jan 2009 16:02:12 +0100 Subject: x86: add might_sleep() to do_page_fault() Impact: widen debug checks VirtualBox calls do_page_fault() from an atomic context but runs into a might_sleep() way pas this point, cure that. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8f4b859a04b..eb4d7fe0593 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -888,6 +888,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in which + * case we'll have missed the might_sleep() from down_read(). + */ + might_sleep(); } vma = find_vma(mm, address); -- cgit v1.2.3 From 8f47e16348e8e25eedf639092a8a2f10a66aba34 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 31 Jan 2009 02:03:42 +0100 Subject: x86: update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 56fe7124fbe..16582960056 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -4,7 +4,7 @@ * Based on code by Ingo Molnar and Andi Kleen, copyrighted * as follows: * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2003-2009 Red Hat Inc. * All Rights Reserved. * Copyright 2005 Andi Kleen, SUSE Labs. * Copyright 2007 Jiri Kosina, SUSE Labs. -- cgit v1.2.3 From 0973a06cde8cc1522fbcf2baacb926f1ee3f4c79 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Feb 2009 15:24:09 -0800 Subject: x86: mm: introduce helper function in fault.c Impact: cleanup Introduce helper function fault_in_kernel_address() to make editors happy. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eb4d7fe0593..8e9b0f1fd87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -775,6 +775,15 @@ static inline int access_error(unsigned long error_code, int write, return 0; } +static int fault_in_kernel_space(unsigned long address) +{ +#ifdef CONFIG_X86_32 + return address >= TASK_SIZE; +#else /* !CONFIG_X86_32 */ + return address >= TASK_SIZE64; +#endif /* CONFIG_X86_32 */ +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -817,11 +826,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ -#ifdef CONFIG_X86_32 - if (unlikely(address >= TASK_SIZE)) { -#else - if (unlikely(address >= TASK_SIZE64)) { -#endif + if (unlikely(fault_in_kernel_space(address))) { if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; -- cgit v1.2.3 From 44581a28e805a31661469c4b466b9cd14b36e7b6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 8 Feb 2009 09:58:40 -0500 Subject: x86: fix abuse of per_cpu_offset Impact: bug fix Don't use per_cpu_offset() to determine if it valid to access a per-cpu variable for a given cpu number. It is not a valid assumption on x86-64 anymore. Use cpu_possible() instead. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 08d140fbc31..deb1c1ab786 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -702,7 +702,7 @@ void __cpuinit numa_set_node(int cpu, int node) } #ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); dump_stack(); return; @@ -790,7 +790,7 @@ int early_cpu_to_node(int cpu) if (early_per_cpu_ptr(x86_cpu_to_node_map)) return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - if (!per_cpu_offset(cpu)) { + if (!cpu_possible(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); dump_stack(); -- cgit v1.2.3 From 7651194fb715b2d57658c05a710408f6b8448951 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 11 Feb 2009 22:26:52 +0530 Subject: x86: mm/init_32.c fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/mm/init_32.c: In function ‘find_low_pfn_range’: arch/x86/mm/init_32.c:696: warning: format ‘%u’ expects type ‘unsigned int’, but Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef0507441..d48f2560364 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -692,7 +692,7 @@ void __init find_low_pfn_range(void) max_pfn = MAXMEM_PFN + highmem_pages; if (highmem_pages + MAXMEM_PFN > max_pfn) { printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %uMB.\n", + "available, ignoring highmem size of %luMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); highmem_pages = 0; -- cgit v1.2.3 From 3023533de43c5c01c660e1b48d3700b028eb4615 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:28:43 +0100 Subject: x86: fix warning in find_low_pfn_range() Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d48f2560364..e77459dd38a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -722,7 +722,7 @@ void __init find_low_pfn_range(void) highmem_pages = 0; #ifdef CONFIG_HIGHMEM if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%uMB) is " + printk(KERN_ERR "highmem size specified (%luMB) is " "bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); @@ -731,7 +731,7 @@ void __init find_low_pfn_range(void) if (highmem_pages) { if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %uMB results in " + printk(KERN_ERR "highmem size %luMB results in " "smaller than 64MB lowmem, ignoring it.\n" , pages_to_mb(highmem_pages)); highmem_pages = 0; -- cgit v1.2.3 From 4769843bc265a9c24584b98709cf39e1df5c1404 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:31:41 +0100 Subject: x86, 32-bit: clean up find_low_pfn_range() Impact: cleanup Split find_low_pfn_range() into two functions: - lowmem_pfn_init() - highmem_pfn_init() The former gets called if all of RAM fits into lowmem, otherwise we call highmem_pfn_init(). Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 137 +++++++++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 58 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e77459dd38a..9d36eb9ebd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -675,75 +675,96 @@ static int __init parse_highmem(char *arg) } early_param("highmem", parse_highmem); +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" /* - * Determine low and high memory ranges: + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: */ -void __init find_low_pfn_range(void) +void __init lowmem_pfn_init(void) { - /* it could update max_pfn */ - - /* max_low_pfn is 0, we already have early_res support */ - - max_low_pfn = max_pfn; - if (max_low_pfn > MAXMEM_PFN) { - if (highmem_pages == -1) - highmem_pages = max_pfn - MAXMEM_PFN; - if (highmem_pages + MAXMEM_PFN < max_pfn) - max_pfn = MAXMEM_PFN + highmem_pages; - if (highmem_pages + MAXMEM_PFN > max_pfn) { - printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %luMB.\n", - pages_to_mb(max_pfn - MAXMEM_PFN), + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +void __init highmem_pfn_init(void) +{ + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING - "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = MAXMEM_PFN; + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ #ifndef CONFIG_HIGHMEM64G - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used." - "Use a HIGHMEM64G enabled kernel.\n"); - } + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } #endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ - } else { - if (highmem_pages == -1) - highmem_pages = 0; -#ifdef CONFIG_HIGHMEM - if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%luMB) is " - "bigger than pages available (%luMB)!.\n", - pages_to_mb(highmem_pages), - pages_to_mb(max_pfn)); - highmem_pages = 0; - } - if (highmem_pages) { - if (max_low_pfn - highmem_pages < - 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %luMB results in " - "smaller than 64MB lowmem, ignoring it.\n" - , pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn -= highmem_pages; - } -#else - if (highmem_pages) - printk(KERN_ERR "ignoring highmem size on non-highmem" - " kernel!\n"); -#endif - } +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (max_low_pfn > MAXMEM_PFN) + highmem_pfn_init(); + else + lowmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3 From d88316c243e5458a1888edbe0353c4dec6e61c73 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 15:16:03 +0100 Subject: x86, 32-bit: refactor find_low_pfn_range() Impact: cleanup Make the max_low_pfn logic a bit more standard between lowmem_pfn_init() and highmem_pfn_init(). Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9d36eb9ebd5..1a9612499a3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -687,6 +687,9 @@ early_param("highmem", parse_highmem); */ void __init lowmem_pfn_init(void) { + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + if (highmem_pages == -1) highmem_pages = 0; #ifdef CONFIG_HIGHMEM @@ -720,6 +723,8 @@ void __init lowmem_pfn_init(void) */ void __init highmem_pfn_init(void) { + max_low_pfn = MAXMEM_PFN; + if (highmem_pages == -1) highmem_pages = max_pfn - MAXMEM_PFN; @@ -732,7 +737,6 @@ void __init highmem_pfn_init(void) pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); @@ -758,13 +762,10 @@ void __init find_low_pfn_range(void) { /* it could update max_pfn */ - /* max_low_pfn is 0, we already have early_res support */ - max_low_pfn = max_pfn; - - if (max_low_pfn > MAXMEM_PFN) - highmem_pfn_init(); - else + if (max_pfn <= MAXMEM_PFN) lowmem_pfn_init(); + else + highmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3 From 7b6aa335ca1a845c2262ec7a595b4521bca0f79d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 13:58:15 +0100 Subject: x86, apic: remove genapic.h Impact: cleanup Remove genapic.h and remove all references to it. Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 2 +- arch/x86/mm/tlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 15df1baee10..574c8bc95ef 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include int acpi_numa __initdata; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14c5af4d11e..b641349fe07 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include +#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From e641f5f525acb163ba71d92de79c9c7366deae03 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 14:02:01 +0100 Subject: x86, apic: remove duplicate asm/apic.h inclusions Impact: cleanup Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b641349fe07..a654d59e448 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:09 +0900 Subject: x86: convert to the new dynamic percpu allocator Impact: use new dynamic allocator, unified access to static/dynamic percpu memory Convert to the new dynamic percpu allocator. * implement populate_extra_pte() for both 32 and 64 * update setup_per_cpu_areas() to use pcpu_setup_static() * define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() * define config HAVE_DYNAMIC_PER_CPU_AREA Signed-off-by: Tejun Heo --- arch/x86/mm/init_32.c | 10 ++++++++++ arch/x86/mm/init_64.c | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00263bf07a8..8b1a0ef7f87 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,6 +137,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +void __init populate_extra_pte(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + pmd_t *pmd; + + pmd = one_md_table_init(swapper_pg_dir + pgd_idx); + one_page_table_init(pmd + pmd_idx); +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b49025..7f91e2cdc4c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -223,6 +223,25 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +void __init populate_extra_pte(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) { + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + return; + } + } + + set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); +} + /* * Create large page table mappings for a range of physical addresses. */ -- cgit v1.2.3 From 3c3e5694add02e665bbbd0fecfbbdcc0b903097a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 11:46:36 -0500 Subject: x86: check PMD in spurious_fault handler Impact: fix to prevent hard lockup on bad PMD permissions If the PMD does not have the correct permissions for a page access, but the PTE does, the spurious fault handler will mistake the fault as a lazy TLB transaction. This will result in an infinite loop of: fault -> spurious_fault check (pass) -> return to code -> fault This patch adds a check and a warn on if the PTE passes the permissions but the PMD does not. [ Updated: Ingo Molnar suggested using WARN_ONCE with some text ] Signed-off-by: Steven Rostedt --- arch/x86/mm/fault.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c76ef1d701c..278d645d108 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -455,6 +455,7 @@ static int spurious_fault(unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -482,7 +483,17 @@ static int spurious_fault(unsigned long address, if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD + * If not, then there's a bug in the page tables. + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* -- cgit v1.2.3 From 7a5714e0186030676d79a7b4b9830c8e45c3b0a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 17:44:21 +0100 Subject: x86, pat: add large-PAT check to split_large_page() Impact: future-proof the split_large_page() function Linus noticed that split_large_page() is not safe wrt. the PAT bit: it is bit 12 on the 1GB and 2MB page table level (_PAGE_BIT_PAT_LARGE), and it is bit 7 on the 4K page table level (_PAGE_BIT_PAT). Currently it is not a problem because we never set _PAGE_BIT_PAT_LARGE on any of the large-page mappings - but should this happen in the future the split_large_page() would silently lift bit 12 into the lowlevel 4K pte and would start corrupting the physical page frame offset. Not fun. So add a debug warning, to make sure if something ever sets the PAT bit then this function gets updated too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7be47d1a97e..8253bc97587 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { -- cgit v1.2.3 From 2d4a71676f4d89418a0d53e60b89e8b804b390b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 19:56:40 +0100 Subject: x86, mm: fault.c cleanup Impact: cleanup, no code changed Clean up various small details, which can be correctness checked automatically: - tidy up the include file section - eliminate unnecessary includes - introduce show_signal_msg() to clean up code flow - standardize the code flow - standardize comments and other style details - more cleanups, pointed out by checkpatch No code changed on either 32-bit nor 64-bit: arch/x86/mm/fault.o: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4632 32 24 4688 1250 fault.o.after the md5 changed due to a change in a single instruction: 2e8a8241e7f0d69706776a5a26c90bc0 fault.o.before.asm c5c3d36e725586eb74f0e10692f0193e fault.o.after.asm Because a __LINE__ reference in a WARN_ONCE() has changed. On 32-bit a few stack offsets changed - no code size difference nor any functionality difference. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 546 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 331 insertions(+), 215 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4b9fc5001c..351d679bf97 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,56 +1,59 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include /* For unblank_screen() */ +#include +#include #include #include -#include /* for max_low_pfn */ -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include + +#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include -#include #include +#include /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { @@ -82,23 +85,27 @@ static inline int notify_page_fault(struct pt_regs *regs) } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * 64-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long error_code, - unsigned long addr) +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { + unsigned char *max_instr; unsigned char *instr; int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -114,9 +121,9 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return 0; while (scan_more && instr < max_instr) { - unsigned char opcode; unsigned char instr_hi; unsigned char instr_lo; + unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; @@ -173,15 +180,17 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) { siginfo_t info; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } @@ -189,6 +198,7 @@ static void force_sig_info_fault(int si_signo, int si_code, static int bad_address(void *p) { unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); } #endif @@ -200,13 +210,14 @@ static void dump_pagetable(unsigned long address) page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,14 +229,15 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } @@ -239,26 +251,38 @@ static void dump_pagetable(unsigned long address) pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk("PTE %lx", pte_val(*pte)); -ret: +out: printk("\n"); return; bad: @@ -285,7 +309,6 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) * and redundant with the set_pmd() on non-PAE. As would * set_pud. */ - pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) @@ -295,11 +318,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) return NULL; + if (!pmd_present(*pmd)) { set_pmd(pmd, *pmd_k); arch_flush_lazy_mmu_mode(); - } else + } else { BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + return pmd_k; } #endif @@ -312,29 +338,37 @@ KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -344,16 +378,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -363,8 +398,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. + * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -378,8 +414,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_32 if (!oops_may_print()) @@ -389,12 +426,14 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, #ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) + if (pte && pte_present(*pte) && !pte_exec(*pte)) { printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " "(uid: %d)\n", current_uid()); + } } #endif @@ -403,34 +442,45 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; - struct task_struct *tsk = current; + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } #endif -static noinline void no_context(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; @@ -440,18 +490,20 @@ static noinline void no_context(struct pt_regs *regs, int sig; #endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: * - * X86_64 - * Hall of shame of CPU/BIOS bugs. + * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; @@ -461,7 +513,7 @@ static noinline void no_context(struct pt_regs *regs, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. + * terminate things with extreme prejudice: */ #ifdef CONFIG_X86_32 bust_spinlocks(1); @@ -471,7 +523,7 @@ static noinline void no_context(struct pt_regs *regs, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); @@ -487,28 +539,54 @@ static noinline void no_context(struct pt_regs *regs, sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; + /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); #endif } -static void __bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here. + * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space. + * from user space: */ if (is_prefetch(regs, error_code, address)) return; @@ -516,22 +594,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, if (is_errata100(regs, address)) return; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; } @@ -541,15 +613,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, no_context(regs, error_code, address); } -static noinline void bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void __bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -562,67 +635,77 @@ static void __bad_area(struct pt_regs *regs, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void bad_area_access_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void out_of_memory(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). + * (which will retry the fault, or kill us if we got oom-killed): */ up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); } -static void do_sigbus(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die */ + /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); + #ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ + /* User space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; #endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void mm_fault_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address, unsigned int fault) +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) + if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -630,16 +713,19 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int spurious_fault(unsigned long error_code, - unsigned long address) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -678,20 +764,23 @@ static noinline int spurious_fault(unsigned long error_code, return 0; /* - * Make sure we have permissions in PMD - * If not, then there's a bug in the page tables. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ ret = spurious_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* - * X86_32 - * Handle a fault on the vmalloc or module mapping area + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area * - * X86_64 - * Handle a fault on the vmalloc area + * 64-bit: + * + * Handle a fault on the vmalloc area * * This assumes no large pages in there. */ @@ -702,7 +791,7 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd_k; pte_t *pte_k; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; @@ -717,9 +806,11 @@ static noinline int vmalloc_fault(unsigned long address) pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; + return 0; #else pgd_t *pgd, *pgd_ref; @@ -727,69 +818,84 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - /* Below here mismatches are bugs because these lower tables - are shared */ + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ pud = pud_offset(pgd, address); pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; + pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); + return 0; #endif } int show_unhandled_signals = 1; -static inline int access_error(unsigned long error_code, int write, - struct vm_area_struct *vma) +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present */ + /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - } else if (unlikely(error_code & PF_PROT)) { - /* read, present */ - return 1; - } else { - /* read, not present */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) - return 1; + return 0; } + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + return 0; } @@ -797,9 +903,9 @@ static int fault_in_kernel_space(unsigned long address) { #ifdef CONFIG_X86_32 return address >= TASK_SIZE; -#else /* !CONFIG_X86_32 */ +#else return address >= TASK_SIZE64; -#endif /* CONFIG_X86_32 */ +#endif } /* @@ -812,18 +918,19 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - unsigned long address; + struct vm_area_struct *vma; struct task_struct *tsk; + unsigned long address; struct mm_struct *mm; - struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -847,22 +954,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ + /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); + return; } - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (unlikely(notify_page_fault(regs))) return; /* @@ -870,13 +978,15 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) @@ -884,8 +994,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #endif /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -894,19 +1004,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -917,8 +1027,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in which - * case we'll have missed the might_sleep() from down_read(). + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): */ might_sleep(); } @@ -938,7 +1049,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -957,6 +1068,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -965,13 +1077,15 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else @@ -1004,13 +1118,13 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { + unsigned long flags; struct page *page; spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) + if (!vmalloc_sync_one(page_address(page), address)) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -1018,12 +1132,14 @@ void vmalloc_sync_all(void) #else /* CONFIG_X86_64 */ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; -- cgit v1.2.3 From 107a03678cac0dd6cf7095f81442a4fa477e4700 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 20:37:05 +0100 Subject: x86, mm: fault.c, refactor/simplify the is_prefetch() code Impact: no functionality changed Factor out the opcode checker into a helper inline. The code got a tiny bit smaller: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4618 32 24 4674 1242 fault.o.after And it got cleaner / easier to review as well. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 351d679bf97..72973c7682b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -99,12 +99,58 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner. */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; /* @@ -114,68 +160,22 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { - unsigned char instr_hi; - unsigned char instr_lo; + while (instr < max_instr) { unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } } return prefetch; } -- cgit v1.2.3 From 8c938f9fae887f6e180bf802aa1c33cf74712aff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:12:18 +0100 Subject: x86, mm: fault.c, factor out the vm86 fault check Impact: cleanup Instead of an ugly, open-coded, #ifdef-ed vm86 related legacy check in do_page_fault(), put it into the check_v8086_mode() helper function and merge it with an existing #ifdef. Also, simplify the code flow a tiny bit in the helper. No code changed: arch/x86/mm/fault.o: text data bss dec hex filename 2711 12 12 2735 aaf fault.o.before 2711 12 12 2735 aaf fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 72973c7682b..7dc0615c3cf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -328,14 +328,41 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -#endif -#ifdef CONFIG_X86_64 +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +#else /* CONFIG_X86_64: */ + static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + #endif /* @@ -1091,16 +1118,8 @@ good_area: else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif + check_v8086_mode(regs, address, tsk); + up_read(&mm->mmap_sem); } -- cgit v1.2.3 From 121d5d0a7e5808fbcfda484efd7ba840ac93450f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:18:08 +0100 Subject: x86, mm: fault.c, enable PF_RSVD checks on 32-bit too Impact: improve page fault handling robustness The 'PF_RSVD' flag (bit 3) of the page-fault error_code is a relatively recent addition to x86 CPUs, so the 32-bit do_fault() implementation never had it. This flag gets set when the CPU detects nonzero values in any reserved bits of the page directory entries. Extend the existing 64-bit check for PF_RSVD in do_page_fault() to 32-bit too. If we detect such a fault then we print a more informative oops and the pagetables. This unifies the code some more, removes an ugly #ifdef and improves the 32-bit page fault code robustness a bit. It slightly increases the 32-bit kernel text size. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7dc0615c3cf..3e366146273 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -477,7 +477,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); } -#ifdef CONFIG_X86_64 static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -503,7 +502,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -#endif static noinline void no_context(struct pt_regs *regs, unsigned long error_code, @@ -1015,10 +1013,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); -#endif /* * If we're in an interrupt, have no user context or are running -- cgit v1.2.3 From b814d41f0987c7648d7ed07471258101c95c026b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:32:10 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault() Impact: cleanup Remove an #ifdef from kmmio_fault() - we can do this by providing default implementations for is_kmmio_active() and kmmio_handler(). The compiler optimizes it all away in the !CONFIG_MMIOTRACE case. Also, while at it, clean up mmiotrace.h a bit: - standard header guards - standard vertical spaces for structure definitions No code changed (both with mmiotrace on and off in the config): text data bss dec hex filename 2947 12 12 2971 b9b fault.o.before 2947 12 12 2971 b9b fault.o.after Cc: Pekka Paalanen Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3e366146273..fe99af4b86d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,13 +55,14 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +/* + * (returns 0 if mmiotrace is disabled) + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } -- cgit v1.2.3 From b18018126f422f5b706fd750373425e10e84b486 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:42:57 +0100 Subject: x86, mm, kprobes: fault.c, simplify notify_page_fault() Impact: cleanup Remove an #ifdef from notify_page_fault(). The function still compiles to nothing in the !CONFIG_KPROBES case. Introduce kprobes_built_in() and kprobe_fault_handler() helpers to allow this - they returns 0 if !CONFIG_KPROBES. No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fe99af4b86d..379beaec6ca 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -68,11 +68,10 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -80,9 +79,6 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; -#else - return 0; -#endif } /* -- cgit v1.2.3 From f2f13a8535174dbb813a0607a9d4737cfba98f6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:50:24 +0100 Subject: x86, mm: fault.c, reorder functions Impact: cleanup Avoid a couple more #ifdefs by moving fundamentally non-unifiable functions into a single #ifdef 32-bit / #else / #endif block in fault.c: vmalloc*(), dump_pagetable(), check_vm8086_mode(). No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 474 ++++++++++++++++++++++++++-------------------------- 1 file changed, 239 insertions(+), 235 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 379beaec6ca..4ce62fb80da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,18 +191,124 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, force_sig_info(si_signo, &info, tsk); } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - unsigned long dummy; + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; - return probe_kernel_address((unsigned long *)p, dummy); + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); @@ -239,7 +345,132 @@ static void dump_pagetable(unsigned long address) } printk("\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -284,83 +515,9 @@ out: return; bad: printk("BAD\n"); -#endif -} - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } - - return pmd_k; -} - -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ - unsigned long bit; - - if (!v8086_mode(regs)) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; -} - -#else /* CONFIG_X86_64: */ - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ } -#endif +#endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. @@ -795,109 +952,6 @@ spurious_fault(unsigned long error_code, unsigned long address) return ret; } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - * - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - - return 0; -#endif -} - int show_unhandled_signals = 1; static inline int @@ -1115,53 +1169,3 @@ good_area: up_read(&mm->mmap_sem); } - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif -} -- cgit v1.2.3 From 8f7661496cece8320137d5e26808825498fd2b26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:00:29 +0100 Subject: x86, mm: fault.c, unify oops printing Impact: refine/extend page fault related oops printing on 64-bit - honor the pause_on_oops logic on 64-bit too - print out NX fault warnings on 64-bit as well - factor out the NX fault message to make it git-greppable and readable Note that this means that we do the PF_INSTR check on 32-bit non-PAE as well where it should not occur ... normally. Cannot hurt. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4ce62fb80da..ebfaca3bbb1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -595,28 +595,24 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) { - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); - } + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) -- cgit v1.2.3 From 1cc99544dde9e48602979f16b9309fade6e93051 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:07:48 +0100 Subject: x86, mm: fault.c, unify oops handling Impact: add oops-recursion check to 32-bit Unify the oops state-machine, to the 64-bit version. It is slightly more careful in that it does a recursion check in oops_begin(), and is thus more likely to show the relevant oops. It also means that 32-bit will print one more line at the end of pagefault triggered oopses: printk(KERN_EMERG "CR2: %016lx\n", address); Which is generally good information to be seen in partial-dump digital-camera jpegs ;-) The downside is the somewhat more complex critical path. Both variants have been tested well meanwhile by kernel developers crashing their boxes so i dont think this is a practical worry. This removes 3 ugly #ifdefs from no_context() and makes the function a lot nicer read. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ebfaca3bbb1..8fe2dd254df 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -659,11 +659,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; unsigned long *stackend; - -#ifdef CONFIG_X86_64 unsigned long flags; int sig; -#endif /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) @@ -690,11 +687,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else flags = oops_begin(); -#endif show_fault_oops(regs, error_code, address); @@ -702,15 +695,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -719,7 +707,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "CR2: %016lx\n", address); oops_end(flags, regs, sig); -#endif } /* -- cgit v1.2.3 From c3731c68668325abddee8665018c74c7156a57be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:22:34 +0100 Subject: x86, mm: fault.c, remove #ifdef from do_page_fault() Impact: cleanup do_page_fault() has this ugly #ifdef in its prototype: #ifdef CONFIG_X86_64 asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) Replace it with 'dotraplinkage' which maps to exactly the above construct: nothing on 32-bit and asmlinkage on 64-bit. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8fe2dd254df..9c2dc5d7953 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -972,10 +972,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; -- cgit v1.2.3 From d951734654f76a370a89b4e531af9b765bd13541 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:32:28 +0100 Subject: x86, mm: rename TASK_SIZE64 => TASK_SIZE_MAX Impact: cleanup Rename TASK_SIZE64 to TASK_SIZE_MAX, and provide the define on 32-bit too. (mapped to TASK_SIZE) This allows 32-bit code to make use of the (former-) TASK_SIZE64 symbol as well, in a clean way. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9c2dc5d7953..6fa9f175cba 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -963,7 +963,7 @@ static int fault_in_kernel_space(unsigned long address) #ifdef CONFIG_X86_32 return address >= TASK_SIZE; #else - return address >= TASK_SIZE64; + return address >= TASK_SIZE_MAX; #endif } -- cgit v1.2.3 From 7c178a26d3e753d2a4346d3e4b8aa549d387f698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:27:29 +0100 Subject: x86, mm: fault.c, remove #ifdef from fault_in_kernel_space() Impact: cleanup Removal of an #ifdef in fault_in_kernel_space(), by making use of the new TASK_SIZE_MAX symbol which is now available on 32-bit too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6fa9f175cba..f195691ec26 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -960,11 +960,7 @@ access_error(unsigned long error_code, int write, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { -#ifdef CONFIG_X86_32 - return address >= TASK_SIZE; -#else return address >= TASK_SIZE_MAX; -#endif } /* -- cgit v1.2.3 From cd1b68f08f6328737928e5b8ba8ef80394680ff0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:39:02 +0100 Subject: x86, mm: fault.c, give another attempt at prefetch handing before SIGBUS Impact: extend prefetch handling on 64-bit Currently there's an extra is_prefetch() check done in do_sigbus(), which we only do on 32 bits. This is a last-ditch check before we terminate a task, so it's worth giving prefetch instructions another chance - should none of our existing quirks have caught a prefetch instruction related spurious fault. The only risk is if a prefetch causes a real sigbus, in that case we'll not OOM but try another fault. But this code has been on 32-bit for a long time, so it should be fine in practice. So do this on 64-bit too - and thus remove one more #ifdef. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f195691ec26..413e835e4a8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,11 +836,9 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) if (!(error_code & PF_USER)) no_context(regs, error_code, address); -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault: */ + /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; -#endif tsk->thread.cr2 = address; tsk->thread.error_code = error_code; -- cgit v1.2.3 From f8eeb2e6be367d79be3617f0a12646bced8b2fe6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:13:36 +0100 Subject: x86, mm: fault.c, update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 413e835e4a8..9bb07d331c3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,6 +1,7 @@ /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include #include -- cgit v1.2.3 From 2366c298b5afe52e635afd5604b69ce9fd4471fc Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 22 Feb 2009 01:01:13 +0100 Subject: x86: numa_32.c: fix sparse warning: Using plain integer as NULL pointer Fix this sparse warning: arch/x86/mm/numa_32.c:197:24: warning: Using plain integer as NULL pointer Signed-off-by: Hannes Eder Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173..3957cd6d645 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; + return NULL; node_remap_alloc_vaddr[nid] += size; memset(allocation, 0, size); -- cgit v1.2.3 From b319eed0aa0a6d710887350a3cb734c572aa64c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 10:24:18 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault(), cleanup Clarify the kmmio_fault() comment. Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9bb07d331c3..a03b7279efa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -57,7 +57,8 @@ enum x86_pf_error_code { }; /* - * (returns 0 if mmiotrace is disabled) + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -- cgit v1.2.3 From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: update populate_extra_pte() and add populate_extra_pmd() Impact: minor change to populate_extra_pte() and addition of pmd flavor Update populate_extra_pte() to return pointer to the pte_t for the specified address and add populate_extra_pmd() which only populates till the pmd and returns pointer to the pmd entry for the address. For 64bit, pud/pmd/pte fill functions are separated out from set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and populate_extra_{pte|pmd}(). Signed-off-by: Tejun Heo --- arch/x86/mm/init_32.c | 13 ++++++--- arch/x86/mm/init_64.c | 75 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 55 insertions(+), 33 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8b1a0ef7f87..84a26883ab4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,14 +137,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { int pgd_idx = pgd_index(vaddr); int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); pmd_t *pmd; - pmd = one_md_table_init(swapper_pg_dir + pgd_idx); - one_page_table_init(pmd + pmd_idx); + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; } static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7f91e2cdc4c..7d4e76da336 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,34 +168,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -223,23 +239,22 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; pud_t *pud; pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) { - printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); - return; - } - } + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; - set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); } /* -- cgit v1.2.3 From 40823f737e5bd186a1156fb1c28f360260e1e084 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:26:18 +0100 Subject: x86: memtest: reuse test patterns when memtest parameter exceeds number of available patterns Impact: fix unexpected behaviour when pattern number is out of range Current implementation provides 4 patterns for memtest. The code doesn't check whether the memtest parameter value exceeds the maximum pattern number. Instead the memtest code pretends to test with non-existing patterns, e.g. when booting with memtest=10 I've observed the following ... early_memtest: pattern num 10 0000001000 - 0000006000 pattern 0 ... 0000001000 - 0000006000 pattern 1 ... 0000001000 - 0000006000 pattern 2 ... 0000001000 - 0000006000 pattern 3 ... 0000001000 - 0000006000 pattern 4 ... 0000001000 - 0000006000 pattern 5 ... 0000001000 - 0000006000 pattern 6 ... 0000001000 - 0000006000 pattern 7 ... 0000001000 - 0000006000 pattern 8 ... 0000001000 - 0000006000 pattern 9 ... But in fact Linux didn't test anything for patterns > 4 as the default case in memtest() is to leave the function. I suggest to use the memtest parameter as the number of tests to be performed and to re-iterate over all existing patterns. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 9cab18b0b85..00b8bdc64c3 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,6 +9,8 @@ #include +#define _MAX_MEM_PATTERNS 4 + static void __init memtest(unsigned long start_phys, unsigned long size, unsigned pattern) { @@ -21,6 +23,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size, unsigned long count; unsigned long incr; + pattern = pattern % _MAX_MEM_PATTERNS; + switch (pattern) { case 0: val = 0UL; @@ -110,8 +114,9 @@ void __init early_memtest(unsigned long start, unsigned long end) t_size = end - t_start; printk(KERN_CONT "\n %010llx - %010llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, pattern); + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, + pattern % _MAX_MEM_PATTERNS); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 6d74171bf7315257d276aa35400c5a8d6a993f19 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:27:27 +0100 Subject: x86: memtest: introduce array to select memtest patterns Impact: code cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 65 ++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 00b8bdc64c3..827f94044cf 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,48 +9,25 @@ #include -#define _MAX_MEM_PATTERNS 4 +static u64 patterns[] __initdata = { + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, +}; static void __init memtest(unsigned long start_phys, unsigned long size, - unsigned pattern) + u64 pattern) { unsigned long i; - unsigned long *start; + u64 *start; unsigned long start_bad; unsigned long last_bad; - unsigned long val; unsigned long start_phys_aligned; unsigned long count; unsigned long incr; - pattern = pattern % _MAX_MEM_PATTERNS; - - switch (pattern) { - case 0: - val = 0UL; - break; - case 1: - val = -1UL; - break; - case 2: -#ifdef CONFIG_X86_64 - val = 0x5555555555555555UL; -#else - val = 0x55555555UL; -#endif - break; - case 3: -#ifdef CONFIG_X86_64 - val = 0xaaaaaaaaaaaaaaaaUL; -#else - val = 0xaaaaaaaaUL; -#endif - break; - default: - return; - } - - incr = sizeof(unsigned long); + incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); count = (size - (start_phys_aligned - start_phys))/incr; start = __va(start_phys_aligned); @@ -58,15 +35,16 @@ static void __init memtest(unsigned long start_phys, unsigned long size, last_bad = 0; for (i = 0; i < count; i++) - start[i] = val; + start[i] = pattern; for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != val) { + if (*start != pattern) { if (start_phys_aligned == last_bad + incr) { last_bad += incr; } else { if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", - val, start_bad, last_bad + incr); + printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", + (unsigned long long) pattern, + start_bad, last_bad + incr); reserve_early(start_bad, last_bad + incr, "BAD RAM"); } start_bad = last_bad = start_phys_aligned; @@ -74,8 +52,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size, } } if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", - val, start_bad, last_bad + incr); + printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", + (unsigned long long) pattern, start_bad, + last_bad + incr); reserve_early(start_bad, last_bad + incr, "BAD RAM"); } } @@ -95,13 +74,16 @@ early_param("memtest", parse_memtest); void __init early_memtest(unsigned long start, unsigned long end) { u64 t_start, t_size; - unsigned pattern; + unsigned int i; + u64 pattern; if (!memtest_pattern) return; printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); - for (pattern = 0; pattern < memtest_pattern; pattern++) { + for (i = 0; i < memtest_pattern; i++) { + unsigned int idx = i % ARRAY_SIZE(patterns); + pattern = patterns[idx]; t_start = start; t_size = 0; while (t_start < end) { @@ -115,8 +97,7 @@ void __init early_memtest(unsigned long start, unsigned long end) printk(KERN_CONT "\n %010llx - %010llx pattern %d", (unsigned long long)t_start, - (unsigned long long)t_start + t_size, - pattern % _MAX_MEM_PATTERNS); + (unsigned long long)t_start + t_size, idx); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 7dad169e57eda1f0aa6dc5ac43a898b4b0ced2c7 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:28:07 +0100 Subject: x86: memtest: cleanup memtest function Impact: code cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 827f94044cf..01a72d6825b 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -16,6 +16,15 @@ static u64 patterns[] __initdata = { 0xaaaaaaaaaaaaaaaaULL, }; +static void __init reserve_bad_mem(u64 pattern, unsigned long start_bad, + unsigned long end_bad) +{ + printk(KERN_CONT "\n %016llx bad mem addr " + "%010lx - %010lx reserved", + (unsigned long long) pattern, start_bad, end_bad); + reserve_early(start_bad, end_bad, "BAD RAM"); +} + static void __init memtest(unsigned long start_phys, unsigned long size, u64 pattern) { @@ -37,26 +46,18 @@ static void __init memtest(unsigned long start_phys, unsigned long size, for (i = 0; i < count; i++) start[i] = pattern; for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != pattern) { - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - } else { - if (start_bad) { - printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", - (unsigned long long) pattern, - start_bad, last_bad + incr); - reserve_early(start_bad, last_bad + incr, "BAD RAM"); - } - start_bad = last_bad = start_phys_aligned; - } + if (*start == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; } - if (start_bad) { - printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", - (unsigned long long) pattern, start_bad, - last_bad + incr); - reserve_early(start_bad, last_bad + incr, "BAD RAM"); - } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); } /* default is disabled */ -- cgit v1.2.3 From 570c9e69aaa84689fb8ed2a3a4af39ca54ba7a47 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:28:58 +0100 Subject: x86: memtest: adapt log messages - print test pattern instead of pattern number, - show pattern as stored in memory, - use proper priority flags, - consistent use of u64 throughout the code Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 01a72d6825b..3232397783a 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -16,25 +16,22 @@ static u64 patterns[] __initdata = { 0xaaaaaaaaaaaaaaaaULL, }; -static void __init reserve_bad_mem(u64 pattern, unsigned long start_bad, - unsigned long end_bad) +static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) { - printk(KERN_CONT "\n %016llx bad mem addr " - "%010lx - %010lx reserved", - (unsigned long long) pattern, start_bad, end_bad); + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); reserve_early(start_bad, end_bad, "BAD RAM"); } -static void __init memtest(unsigned long start_phys, unsigned long size, - u64 pattern) +static void __init memtest(u64 pattern, u64 start_phys, u64 size) { - unsigned long i; + u64 i, count; u64 *start; - unsigned long start_bad; - unsigned long last_bad; - unsigned long start_phys_aligned; - unsigned long count; - unsigned long incr; + u64 start_bad, last_bad; + u64 start_phys_aligned; + size_t incr; incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); @@ -81,7 +78,7 @@ void __init early_memtest(unsigned long start, unsigned long end) if (!memtest_pattern) return; - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); for (i = 0; i < memtest_pattern; i++) { unsigned int idx = i % ARRAY_SIZE(patterns); pattern = patterns[idx]; @@ -96,14 +93,13 @@ void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %010llx - %010llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, idx); - - memtest(t_start, t_size, pattern); + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) t_start, + (unsigned long long) t_start + t_size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, t_start, t_size); t_start += t_size; } } - printk(KERN_CONT "\n"); } -- cgit v1.2.3 From bfb4dc0da45f8fddc76eba7e62919420c7d6dae2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:30:04 +0100 Subject: x86: memtest: wipe out test pattern from memory Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 56 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 3232397783a..9c52ef19e6f 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -57,6 +57,29 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) reserve_bad_mem(pattern, start_bad, last_bad + incr); } +static void __init do_one_pass(u64 pattern, u64 start, u64 end) +{ + u64 size = 0; + + while (start < end) { + start = find_e820_area_size(start, &size, 1); + + /* done ? */ + if (start >= end) + break; + if (start + size > end) + size = end - start; + + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) start, + (unsigned long long) start + size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, start, size); + + start += size; + } +} + /* default is disabled */ static int memtest_pattern __initdata; @@ -71,35 +94,22 @@ early_param("memtest", parse_memtest); void __init early_memtest(unsigned long start, unsigned long end) { - u64 t_start, t_size; unsigned int i; - u64 pattern; + unsigned int idx = 0; if (!memtest_pattern) return; printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); for (i = 0; i < memtest_pattern; i++) { - unsigned int idx = i % ARRAY_SIZE(patterns); - pattern = patterns[idx]; - t_start = start; - t_size = 0; - while (t_start < end) { - t_start = find_e820_area_size(t_start, &t_size, 1); - - /* done ? */ - if (t_start >= end) - break; - if (t_start + t_size > end) - t_size = end - t_start; - - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long) t_start, - (unsigned long long) t_start + t_size, - (unsigned long long) cpu_to_be64(pattern)); - memtest(pattern, t_start, t_size); - - t_start += t_size; - } + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } + + if (idx > 0) { + printk(KERN_INFO "early_memtest: wipe out " + "test pattern from memory\n"); + /* additional test with pattern 0 will do this */ + do_one_pass(0, start, end); } } -- cgit v1.2.3 From 63823126c221dd721ce7351b596b3b73aa943613 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:31:49 +0100 Subject: x86: memtest: add additional (regular) test patterns Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 9c52ef19e6f..0bcd7883d03 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -14,6 +14,19 @@ static u64 patterns[] __initdata = { 0xffffffffffffffffULL, 0x5555555555555555ULL, 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ }; static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) -- cgit v1.2.3 From 7880f7464546842ee14179bef16a6e14381ea638 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:13 -0800 Subject: gpu/drm, x86, PAT: routine to keep identity map in sync Add a function to check and keep identity maps in sync, when changing any memory type. One of the follow on patches will also use this routine. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index aebbf67a79d..399383c6f61 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -624,6 +624,33 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +{ + unsigned long id_sz; + + if (!pat_enabled || base >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < base + size) ? + __pa(high_memory) - base : + size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + printk(KERN_INFO + "%s:%d ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + base, (unsigned long long)(base + size)); + return -EINVAL; + } + return 0; +} + /* * Internal interface to reserve a range of physical memory with prot. * Reserved non RAM regions only and after successful reserve_memtype, @@ -633,7 +660,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; - int id_sz, ret; + int ret; unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); @@ -670,23 +697,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags); } - /* Need to keep identity mapping in sync */ - if (paddr >= __pa(high_memory)) - return 0; - - id_sz = (__pa(high_memory) < paddr + size) ? - __pa(high_memory) - paddr : - size; - - if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { free_memtype(paddr, paddr + size); - printk(KERN_ERR - "%s:%d reserve_pfn_range ioremap_change_attr failed %s " - "for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - (unsigned long long)paddr, - (unsigned long long)(paddr + size)); return -EINVAL; } return 0; -- cgit v1.2.3 From 17581ad812a9abb0182260374ef2e52d4a808a64 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:14 -0800 Subject: gpu/drm, x86, PAT: PAT support for io_mapping_* Make io_mapping_create_wc and io_mapping_free go through PAT to make sure that there are no memory type aliases. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 6c2b1af1692..94596f794b1 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -int +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -int +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,6 +38,44 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif +int +reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) +{ + unsigned long ret_flag; + + if (!is_io_mapping_possible(base, size)) + goto out_err; + + if (!pat_enabled) { + *prot = pgprot_noncached(PAGE_KERNEL); + return 0; + } + + if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) + goto out_err; + + if (ret_flag == _PAGE_CACHE_WB) + goto out_free; + + if (kernel_map_sync_memtype(base, size, ret_flag)) + goto out_free; + + *prot = __pgprot(__PAGE_KERNEL | ret_flag); + return 0; + +out_free: + free_memtype(base, base + size); +out_err: + return -EINVAL; +} + +void +free_io_memtype(u64 base, unsigned long size) +{ + if (pat_enabled) + free_memtype(base, base + size); +} + /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * -- cgit v1.2.3 From 13093cb0e59053bf97910de3a24f07cdff71c62c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 03:16:47 +0100 Subject: gpu/drm, x86, PAT: PAT support for io_mapping_*, export symbols for modules Impact: build fix ERROR: "reserve_io_memtype_wc" [drivers/gpu/drm/i915/i915.ko] undefined! ERROR: "free_io_memtype" [drivers/gpu/drm/i915/i915.ko] undefined! Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 94596f794b1..d5e28424622 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -68,6 +68,7 @@ out_free: out_err: return -EINVAL; } +EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); void free_io_memtype(u64 base, unsigned long size) @@ -75,6 +76,7 @@ free_io_memtype(u64 base, unsigned long size) if (pat_enabled) free_memtype(base, base + size); } +EXPORT_SYMBOL_GPL(free_io_memtype); /* Map 'pfn' using fixed map 'type' and protections 'prot' */ -- cgit v1.2.3 From fd862dde18c3e360f510780e1d1bf615866b11c2 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Sun, 15 Feb 2009 21:48:54 -0300 Subject: x86, fixmap: define reserve_top_address for x86_64 Impact: new interface (not yet use) Define reserve_top_address for x86_64; only for later x86 integration. Signed-off-by: Gustavo F. Padovan Acked-by: Glauber Costa Signed-off-by: H. Peter Anvin --- arch/x86/mm/pgtable.c | 18 ++++++++++++++++++ arch/x86/mm/pgtable_32.c | 16 ---------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 86f2ffc43c3..5b7c7c8464f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); + __FIXADDR_TOP = -reserve - PAGE_SIZE; + __VMALLOC_RESERVE += reserve; +#endif +} + int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0951db9ee51..c3cf6e11576 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -97,22 +97,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -/** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve - * - * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. - */ -void __init reserve_top_address(unsigned long reserve) -{ - BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; -} - /* * vmalloc=size forces the vmalloc area to be exactly 'size' * bytes. This can be used to increase (or decrease) the -- cgit v1.2.3 From f5c1aa1537be39d8b9bb5279b5881d81898fd3cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 1 Mar 2009 12:32:08 +0100 Subject: Revert "gpu/drm, x86, PAT: PAT support for io_mapping_*" This reverts commit 17581ad812a9abb0182260374ef2e52d4a808a64. Sitsofe Wheeler reported that /dev/dri/card0 is MIA on his EeePC 900 and bisected it to this commit. Graphics card is an i915 in an EeePC 900: 00:02.0 VGA compatible controller [0300]: Intel Corporation Mobile 915GM/GMS/910GML Express Graphics Controller [8086:2592] (rev 04) ( Most likely the ioremap() of the driver failed and hence the card did not initialize. ) Reported-by: Sitsofe Wheeler Bisected-by: Sitsofe Wheeler Cc: Venkatesh Pallipadi Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 44 ++------------------------------------------ 1 file changed, 2 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index d5e28424622..6c2b1af1692 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,46 +38,6 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif -int -reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) -{ - unsigned long ret_flag; - - if (!is_io_mapping_possible(base, size)) - goto out_err; - - if (!pat_enabled) { - *prot = pgprot_noncached(PAGE_KERNEL); - return 0; - } - - if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) - goto out_err; - - if (ret_flag == _PAGE_CACHE_WB) - goto out_free; - - if (kernel_map_sync_memtype(base, size, ret_flag)) - goto out_free; - - *prot = __pgprot(__PAGE_KERNEL | ret_flag); - return 0; - -out_free: - free_memtype(base, base + size); -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); - -void -free_io_memtype(u64 base, unsigned long size) -{ - if (pat_enabled) - free_memtype(base, base + size); -} -EXPORT_SYMBOL_GPL(free_io_memtype); - /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * -- cgit v1.2.3 From 2b688dfd0a93cf3b17c38feef693361da47b0606 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 12:55:04 +0200 Subject: x86: move __VMALLOC_RESERVE to pgtable_32.c Impact: cleanup The __VMALLOC_RESERVE global variable is not used in init_32.c. Move that to pgtable_32.c to reduce the diff between init_32.c and init_64.c. Signed-off-by: Pekka Enberg LKML-Reference: <1236077704.2675.4.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/pgtable_32.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 06708ee94aa..5b06a2f5dea 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,8 +50,6 @@ #include #include -unsigned int __VMALLOC_RESERVE = 128 << 20; - unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0951db9ee51..e4032c886c3 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -20,6 +20,8 @@ #include #include +unsigned int __VMALLOC_RESERVE = 128 << 20; + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. -- cgit v1.2.3 From fd578f9c0a0a7bf3e460e6f21cdc6f4018949e80 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 12:55:05 +0200 Subject: x86: use roundup() instead of PAGE_ALIGN() in find_early_table_space() Impact: cleanup This patch changes find_early_table_space() to use roundup() for rounding up tables to page size to unify the common parts of the 32-bit and 64-bit implementations. Signed-off-by: Pekka Enberg LKML-Reference: <1236077705.2675.6.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5b06a2f5dea..1dd6b6334dc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -845,10 +845,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = PAGE_ALIGN(puds * sizeof(pud_t)); + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { unsigned long extra; @@ -859,10 +859,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* for fixmap */ - tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could -- cgit v1.2.3 From 05f209e7b936a48e341d36831079116a06658ccc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:02 +0200 Subject: x86: add sanity checks to init_32.c Impact: unification This patch adds sanity checks that are already in init_64.c to init_32.c. Signed-off-by: Pekka Enberg LKML-Reference: <1236078902.2675.16.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1dd6b6334dc..1570a822c18 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1214,18 +1214,21 @@ void mark_rodata_ro(void) void free_init_pages(char *what, unsigned long begin, unsigned long end) { -#ifdef CONFIG_DEBUG_PAGEALLOC + unsigned long addr = begin; + + if (addr >= end) + return; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ +#ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", begin, PAGE_ALIGN(end)); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else - unsigned long addr; - /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that @@ -1233,14 +1236,16 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); #endif } -- cgit v1.2.3 From e087edd8c056292191bb989baf49f83ee509e624 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:04 +0200 Subject: x86: make sure initmem is writable on 64-bit Impact: unification This patch ports commit 3c1df68b848b39270752ff8d4b956cc4a4dce0f6 ("x86: make sure initmem is writable") to the 64-bit version to unify implementations of free_init_pages(). Signed-off-by: Pekka Enberg Cc: Arjan van de Ven LKML-Reference: <1236078904.2675.17.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b49025..03da9030d0e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -962,6 +962,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) begin, PAGE_ALIGN(end)); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (; addr < end; addr += PAGE_SIZE) { -- cgit v1.2.3 From e5b2bb552706ca0e30795ee84caacbb37cec5705 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:06 +0200 Subject: x86: unify free_init_pages() and free_initmem() Impact: unification This patch introduces a common arch/x86/mm/init.c and moves the identical free_init_pages() and free_initmem() functions to the file. Signed-off-by: Pekka Enberg LKML-Reference: <1236078906.2675.18.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/init.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 44 -------------------------------------------- arch/x86/mm/init_64.c | 44 -------------------------------------------- 4 files changed, 50 insertions(+), 89 deletions(-) create mode 100644 arch/x86/mm/init.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 2b938a38491..08537747cb5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,4 +1,4 @@ -obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c new file mode 100644 index 00000000000..ce6a722587d --- /dev/null +++ b/arch/x86/mm/init.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr = begin; + + if (addr >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } +#endif +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1570a822c18..cd8d6732613 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1212,50 +1212,6 @@ void mark_rodata_ro(void) } #endif -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr = begin; - - if (addr >= end) - return; - - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. - */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 03da9030d0e..aae87456d93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -945,50 +945,6 @@ void __init mem_init(void) initsize >> 10); } -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr = begin; - - if (addr >= end) - return; - - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. - */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - #ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -- cgit v1.2.3 From 867c5b5292583b1e474cbbcb4c77f09bfca3903c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 14:10:12 +0200 Subject: x86: set_highmem_pages_init() cleanup Impact: cleanup This patch moves set_highmem_pages_init() to arch/x86/mm/highmem_32.c. The declaration of the function is kept in asm/numa_32.h because asm/highmem.h is included only if CONFIG_HIGHMEM is enabled so we can't put the empty static inline function there. Signed-off-by: Pekka Enberg LKML-Reference: <1236082212.2675.24.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 12 ------------ arch/x86/mm/numa_32.c | 26 -------------------------- 3 files changed, 34 insertions(+), 38 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index bcc079c282d..13a823cf564 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,5 +1,6 @@ #include #include +#include /* for totalram_pages */ void *kmap(struct page *page) { @@ -156,3 +157,36 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); + +#ifdef CONFIG_NUMA +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } + totalram_pages += totalhigh_pages; +} +#else +static void __init set_highmem_pages_init(void) +{ + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); + + totalram_pages += totalhigh_pages; +} +#endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cd8d6732613..0b087dcd2c1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -467,22 +467,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, work_with_active_regions(nid, add_highpages_work_fn, &data); } -#ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(void) -{ - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - totalram_pages += totalhigh_pages; -} -#endif /* !CONFIG_NUMA */ - #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { } -static inline void set_highmem_pages_init(void) -{ -} #endif /* CONFIG_HIGHMEM */ void __init native_pagetable_setup_start(pgd_t *base) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173..a04092a8acc 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -423,32 +423,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init set_highmem_pages_init(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int nid; - - for_each_zone(zone) { - unsigned long zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - nid = zone_to_nid(zone); - printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, nid, zone_start_pfn, zone_end_pfn); - - add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn); - } - totalram_pages += totalhigh_pages; -#endif -} - #ifdef CONFIG_MEMORY_HOTPLUG static int paddr_to_nid(u64 addr) { -- cgit v1.2.3 From 03787ceed8f7bf06af29f3b213017d89f8e9423d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Mar 2009 15:32:24 +0100 Subject: x86: set_highmem_pages_init() cleanup, fix !CONFIG_NUMA && CONFIG_HIGHMEM=y Impact: build fix arch/x86/mm/highmem_32.c:187: error: static declaration of 'set_highmem_pages_init' follows non-static declaration arch/x86/include/asm/numa_32.h:8: error: previous declaration of 'set_highmem_pages_init' was here Signed-off-by: Pekka Enberg LKML-Reference: <1236082212.2675.24.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 13a823cf564..00f127c80b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -183,7 +183,7 @@ void __init set_highmem_pages_init(void) totalram_pages += totalhigh_pages; } #else -static void __init set_highmem_pages_init(void) +void __init set_highmem_pages_init(void) { add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); -- cgit v1.2.3 From f254f3909efaf59ca2d0f408de2d044dace60706 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 3 Mar 2009 12:02:57 -0800 Subject: x86: un-__init fill_pud/pmd/pte They are used by __set_fixmap->set_pte_vaddr_pud, which can be used by arch_setup_additional_pages(), and so is used after init. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 11981fc8570..07f44d491df 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,7 +168,7 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { pud_t *pud = (pud_t *)spp_getpage(); @@ -180,7 +180,7 @@ static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) return pud_offset(pgd, vaddr); } -static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) { if (pud_none(*pud)) { pmd_t *pmd = (pmd_t *) spp_getpage(); @@ -192,7 +192,7 @@ static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) return pmd_offset(pud, vaddr); } -static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) { if (pmd_none(*pmd)) { pte_t *pte = (pte_t *) spp_getpage(); -- cgit v1.2.3