From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75..1f524df68b9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; int write, si_code; int fault; + unsigned long *stackend; + #ifdef CONFIG_X86_64 unsigned long flags; #endif @@ -850,6 +853,10 @@ no_context: show_fault_oops(regs, error_code, address); + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3 From dacf7333571d770366bff74d10b56aa545434605 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 7 Jan 2009 17:26:35 +0530 Subject: x86: smp.h move zap_low_mappings declartion to tlbflush.h Impact: cleanup, moving NON-SMP stuff from smp.h Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f99a6c6c432..a9dd0b7ad61 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,7 +49,6 @@ #include #include #include -#include unsigned int __VMALLOC_RESERVE = 128 << 20; -- cgit v1.2.3 From 92181f190b649f7ef2b79cbf5c00f26ccc66da2a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 20 Jan 2009 04:24:26 +0100 Subject: x86: optimise x86's do_page_fault (C entry point for the page fault path) Impact: cleanup, restructure code to improve assembly gcc isn't _all_ that smart about spilling registers to stack or reusing stack slots, even with branch annotations. do_page_fault contained a lot of functionality, so split unlikely paths into their own functions, and mark them as noinline just to be sure. I consider this actually to be somewhat of a cleanup too: the main function now contains about half the number of lines so the normal path is easier to read, while the error cases are also nicely split away. Also, ensure the order of arguments to functions is always the same: regs, addr, error_code. This can reduce code size a tiny bit, and just looks neater too. And add a couple of branch annotations. Before: do_page_fault: subq $360, %rsp #, After: do_page_fault: subq $56, %rsp #, bloat-o-meter: add/remove: 8/0 grow/shrink: 0/1 up/down: 2222/-1680 (542) function old new delta __bad_area_nosemaphore - 506 +506 no_context - 474 +474 vmalloc_fault - 424 +424 spurious_fault - 358 +358 mm_fault_error - 272 +272 bad_area_access_error - 89 +89 bad_area - 89 +89 bad_area_nosemaphore - 10 +10 do_page_fault 2464 784 -1680 Yes, the total size increases by 542 bytes, due to the extra function calls. But these will very rarely be called (except for vmalloc_fault) in a normal workload. Importantly, do_page_fault is less than 1/3rd it's original size, and touches far less stack. Existing gotos and branch hints did move a lot of the infrequently used text out of the fastpath, but that's even further improved after this patch. Signed-off-by: Nick Piggin Acked-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 438 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 256 insertions(+), 182 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 90dfae511a4..033292dc9e2 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner */ -static int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) +static int is_prefetch(struct pt_regs *regs, unsigned long error_code, + unsigned long addr) { unsigned char *instr; int scan_more = 1; @@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static noinline void pgtable_bad(struct pt_regs *regs, + unsigned long error_code, unsigned long address) { unsigned long flags = oops_begin(); int sig = SIGKILL; - struct task_struct *tsk; + struct task_struct *tsk = current; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); + tsk->comm, address); dump_pagetable(address); tsk = current; tsk->thread.cr2 = address; @@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, } #endif +static noinline void no_context(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; +#ifdef CONFIG_X86_64 + unsigned long flags; + int sig; +#endif + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata93(regs, address)) + return; + + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + sig = SIGKILL; + if (__die("Oops", regs, error_code)) + sig = 0; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); +#endif +} + +static void __bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct task_struct *tsk = current; + + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, error_code, address)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( + "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *) regs->ip, (void *) regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + + no_context(regs, error_code, address); +} + +static noinline void bad_area_nosemaphore(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); +} + +static void __bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address, + int si_code) +{ + struct mm_struct *mm = current->mm; + + /* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ + up_read(&mm->mmap_sem); + + __bad_area_nosemaphore(regs, error_code, address, si_code); +} + +static noinline void bad_area(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_MAPERR); +} + +static noinline void bad_area_access_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + __bad_area(regs, error_code, address, SEGV_ACCERR); +} + +/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ +static void out_of_memory(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + /* + * We ran out of memory, call the OOM killer, and return the userspace + * (which will retry the fault, or kill us if we got oom-killed). + */ + up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); +} + +static void do_sigbus(struct pt_regs *regs, + unsigned long error_code, unsigned long address) +{ + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, error_code, address)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +static noinline void mm_fault_error(struct pt_regs *regs, + unsigned long error_code, unsigned long address, unsigned int fault) +{ + if (fault & VM_FAULT_OOM) + out_of_memory(regs, error_code, address); + else if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); +} + static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) @@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static int spurious_fault(unsigned long address, - unsigned long error_code) +static noinline int spurious_fault(unsigned long error_code, + unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address, * * This assumes no large pages in there. */ -static int vmalloc_fault(unsigned long address) +static noinline int vmalloc_fault(unsigned long address) { #ifdef CONFIG_X86_32 unsigned long pgd_paddr; @@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address) int show_unhandled_signals = 1; +static inline int access_error(unsigned long error_code, int write, + struct vm_area_struct *vma) +{ + if (write) { + /* write, present and write, not present */ + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + } else if (unlikely(error_code & PF_PROT)) { + /* read, present */ + return 1; + } else { + /* read, not present */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + } + + return 0; +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -583,16 +786,12 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { + unsigned long address; struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned long address; - int write, si_code; + int write; int fault; -#ifdef CONFIG_X86_64 - unsigned long flags; - int sig; -#endif tsk = current; mm = tsk->mm; @@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* get the address */ address = read_cr2(); - si_code = SEGV_MAPERR; - - if (notify_page_fault(regs)) + if (unlikely(notify_page_fault(regs))) return; if (unlikely(kmmio_fault(regs, address))) return; @@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) + if (spurious_fault(error_code, address)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch * fault we could otherwise deadlock. */ - goto bad_area_nosemaphore; + bad_area_nosemaphore(regs, error_code, address); + return; } - /* * It's safe to allow irq's after cr2 has been saved and the * vmalloc fault has been handled. @@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); + pgtable_bad(regs, error_code, address); #endif /* * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault. */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; + if (unlikely(in_atomic() || !mm)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } /* * When running in the kernel we expect faults to occur only to @@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. */ - if (!down_read_trylock(&mm->mmap_sem)) { + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->ip)) - goto bad_area_nosemaphore; + !search_exception_tables(regs->ip)) { + bad_area_nosemaphore(regs, error_code, address); + return; + } down_read(&mm->mmap_sem); } vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) + if (unlikely(!vma)) { + bad_area(regs, error_code, address); + return; + } + if (likely(vma->vm_start <= address)) goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { + bad_area(regs, error_code, address); + return; + } if (error_code & PF_USER) { /* * Accessing the stack below %sp is always a bug. @@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * and pusha to work. ("enter $65535,$31" pushes * 32 pointers and then decrements %sp by 65535.) */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) - goto bad_area; + if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { + bad_area(regs, error_code, address); + return; + } } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ + if (unlikely(expand_stack(vma, address))) { + bad_area(regs, error_code, address); + return; + } + + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; + write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { + bad_area_access_error(regs, error_code, address); + return; } /* @@ -738,11 +937,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, write); if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); + mm_fault_error(regs, error_code, address, fault); + return; } if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; @@ -760,128 +956,6 @@ good_area: } #endif up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata100(regs, address)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - - if (is_f00f_bug(regs, address)) - return; - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - * - * X86_64 - * Hall of shame of CPU/BIOS bugs. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else - flags = oops_begin(); -#endif - - show_fault_oops(regs, error_code, address); - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else - sig = SIGKILL; - if (__die("Oops", regs, error_code)) - sig = 0; - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, sig); -#endif - -out_of_memory: - /* - * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). - */ - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } DEFINE_SPINLOCK(pgd_lock); -- cgit v1.2.3 From 55f4949f5765e7a29863b6d17a774601810732f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Jan 2009 10:08:53 +0100 Subject: x86, mm: move tlb.c to arch/x86/mm/ Impact: cleanup Now that it's unified, move the (SMP) TLB flushing code from arch/x86/kernel/ to arch/x86/mm/, where it belongs logically. Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 + arch/x86/mm/tlb.c | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 arch/x86/mm/tlb.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index d8cc96a2738..9f05157220f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,6 +1,8 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o +obj-$(CONFIG_X86_SMP) += tlb.o + obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c new file mode 100644 index 00000000000..b3ca1b94065 --- /dev/null +++ b/arch/x86/mm/tlb.c @@ -0,0 +1,296 @@ +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; + +#include +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right per cpu variable for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. + */ + +union smp_flush_state { + struct { + struct mm_struct *flush_mm; + unsigned long flush_va; + spinlock_t tlbstate_lock; + DECLARE_BITMAP(flush_cpumask, NR_CPUS); + }; + char pad[SMP_CACHE_BYTES]; +} ____cacheline_aligned; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static DEFINE_PER_CPU(union smp_flush_state, flush_state); + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +void leave_mm(int cpu) +{ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + BUG(); + cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + load_cr3(swapper_pg_dir); +} +EXPORT_SYMBOL_GPL(leave_mm); + +/* + * + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. + * 1a2) set cpu mmu_state to TLBSTATE_OK + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * was in lazy tlb mode. + * 1a3) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles + * flush ipis. + * 1b1) set cpu mmu_state to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu mmu_state is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush IPI: + * + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. + */ + +/* + * FIXME: use of asmlinkage is not consistent. On x86_64 it's noop + * but still used for documentation purpose but the usage is slightly + * inconsistent. On x86_32, asmlinkage is regparm(0) but interrupt + * entry calls in with the first parameter in %eax. Maybe define + * intrlinkage? + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void smp_invalidate_interrupt(struct pt_regs *regs) +{ + unsigned int cpu; + unsigned int sender; + union smp_flush_state *f; + + cpu = smp_processor_id(); + /* + * orig_rax contains the negated interrupt vector. + * Use that to determine where the sender put the data. + */ + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; + f = &per_cpu(flush_state, sender); + + if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask))) + goto out; + /* + * This was a BUG() but until someone can quote me the + * line from the intel manual that guarantees an IPI to + * multiple CPUs is retried _only_ on the erroring CPUs + * its staying as a return + * + * BUG(); + */ + + if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(f->flush_va); + } else + leave_mm(cpu); + } +out: + ack_APIC_irq(); + smp_mb__before_clear_bit(); + cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask)); + smp_mb__after_clear_bit(); + inc_irq_stat(irq_tlb_count); +} + +static void flush_tlb_others_ipi(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + unsigned int sender; + union smp_flush_state *f; + + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &per_cpu(flush_state, sender); + + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpumask_andnot(to_cpumask(f->flush_cpumask), + cpumask, cpumask_of(smp_processor_id())); + + /* + * Make the above memory operations globally visible before + * sending the IPI. + */ + smp_mb(); + /* + * We have to send the IPI only to + * CPUs affected. + */ + send_IPI_mask(to_cpumask(f->flush_cpumask), + INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpumask_empty(to_cpumask(f->flush_cpumask))) + cpu_relax(); + + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va) +{ + if (is_uv_system()) { + unsigned int cpu; + + cpu = get_cpu(); + cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); + if (cpumask) + flush_tlb_others_ipi(cpumask, mm, va); + put_cpu(); + return; + } + flush_tlb_others_ipi(cpumask, mm, va); +} + +static int __cpuinit init_smp_flush(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); + + return 0; +} +core_initcall(init_smp_flush); + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + + preempt_disable(); + + local_flush_tlb(); + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + preempt_enable(); +} + +void flush_tlb_mm(struct mm_struct *mm) +{ + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + local_flush_tlb(); + else + leave_mm(smp_processor_id()); + } + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +{ + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + __flush_tlb_one(va); + else + leave_mm(smp_processor_id()); + } + + if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(&mm->cpu_vm_mask, mm, va); + + preempt_enable(); +} + +static void do_flush_tlb_all(void *info) +{ + unsigned long cpu = smp_processor_id(); + + __flush_tlb_all(); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(cpu); +} + +void flush_tlb_all(void) +{ + on_each_cpu(do_flush_tlb_all, NULL, 1); +} -- cgit v1.2.3 From 4ec71fa2d2c3f1040348f2604f4b8ccc833d1c2e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 21 Jan 2009 10:24:27 +0100 Subject: x86: uv cleanup, build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: arch/x86/mm/srat_64.c: In function ‘acpi_numa_processor_affinity_init’: arch/x86/mm/srat_64.c:141: error: implicit declaration of function ‘get_uv_system_type’ arch/x86/mm/srat_64.c:141: error: ‘UV_X2APIC’ undeclared (first use in this function) arch/x86/mm/srat_64.c:141: error: (Each undeclared identifier is reported only once arch/x86/mm/srat_64.c:141: error: for each function it appears in.) A couple of UV definitions were moved to asm/uv/uv.h, but srat_64.c did not include that header. Add it. Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 09737c8af07..15df1baee10 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -21,6 +21,7 @@ #include #include #include +#include int acpi_numa __initdata; -- cgit v1.2.3 From fb746d0e1365b7472ccc4c3d5b0672b34a092d0b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 21 Jan 2009 20:45:41 +0100 Subject: x86: optimise page fault entry, cleanup tsk is already assigned to current, drop the redundant second assignment. Signed-off-by: Johannes Weiner Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 033292dc9e2..8f4b859a04b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -419,7 +419,6 @@ static noinline void pgtable_bad(struct pt_regs *regs, printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk = current; tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; -- cgit v1.2.3 From d639bab8da86d330493487e8c0fea8ca31f53427 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:13 -0800 Subject: x86 PAT: ioremap_wc should take resource_size_t parameter Impact: fix/extend ioremap_wc() beyond 4GB aperture on 32-bit ioremap_wc() was taking in unsigned long parameter, where as it should take 64-bit resource_size_t parameter like other ioremap variants. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index bd85d42819e..2ddb1e79a19 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -367,7 +367,7 @@ EXPORT_SYMBOL(ioremap_nocache); * * Must be freed with iounmap. */ -void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size) { if (pat_enabled) return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, -- cgit v1.2.3 From fe40c0af3cff3ea461cf25bddb979abc7279d4df Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 23 Jan 2009 15:49:41 -0800 Subject: x86: uaccess: introduce try and catch framework Impact: introduce new uaccess exception handling framework Introduce {get|put}_user_try and {get|put}_user_catch as new uaccess exception handling framework. {get|put}_user_try begins exception block and {get|put}_user_catch(err) ends the block and gets err if an exception occured in {get|put}_user_ex() in the block. The exception is stored thread_info->uaccess_err. The example usage of this framework is below; int func() { int err = 0; get_user_try { get_user_ex(...); get_user_ex(...); : } get_user_catch(err); return err; } Note: get_user_ex() is not clear the value when an exception occurs, it's different from the behavior of __get_user(), but I think it doesn't matter. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/extable.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 7e8db53528a..61b41ca3b5a 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -23,6 +23,12 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(regs->ip); if (fixup) { + /* If fixup is less than 16, it means uaccess error */ + if (fixup->fixup < 16) { + current_thread_info()->uaccess_err = -EFAULT; + regs->ip += fixup->fixup; + return 1; + } regs->ip = fixup->fixup; return 1; } -- cgit v1.2.3 From 75a048119e76540d73132cfc8e0fa0c0a8bb6c83 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 22 Jan 2009 16:17:05 -0800 Subject: x86: handle PAT more like other CPU features Impact: Cleanup When PAT was originally introduced, it was handled specially for a few reasons: - PAT bugs are hard to track down, so we wanted to maintain a whitelist of CPUs. - The i386 and x86-64 CPUID code was not yet unified. Both of these are now obsolete, so handle PAT like any other features, including ordinary feature blacklisting due to known bugs. Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8b08fb95527..430cb44dd3f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -30,7 +30,7 @@ #ifdef CONFIG_X86_PAT int __read_mostly pat_enabled = 1; -void __cpuinit pat_disable(char *reason) +void __cpuinit pat_disable(const char *reason) { pat_enabled = 0; printk(KERN_INFO "%s\n", reason); @@ -42,6 +42,11 @@ static int __init nopat(char *str) return 0; } early_param("nopat", nopat); +#else +static inline void pat_disable(const char *reason) +{ + (void)reason; +} #endif @@ -78,16 +83,20 @@ void pat_init(void) if (!pat_enabled) return; - /* Paranoia check. */ - if (!cpu_has_pat && boot_pat_state) { - /* - * If this happens we are on a secondary CPU, but - * switched to PAT on the boot CPU. We have no way to - * undo PAT. - */ - printk(KERN_ERR "PAT enabled, " - "but not supported by secondary CPU\n"); - BUG(); + if (!cpu_has_pat) { + if (!boot_pat_state) { + pat_disable("PAT not supported by CPU."); + return; + } else { + /* + * If this happens we are on a secondary CPU, but + * switched to PAT on the boot CPU. We have no way to + * undo PAT. + */ + printk(KERN_ERR "PAT enabled, " + "but not supported by secondary CPU\n"); + BUG(); + } } /* Set PWT to Write-Combining. All other bits stay the same */ -- cgit v1.2.3 From 6470aff619fbb9dff8dfe8afa5033084cd55ca20 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 27 Jan 2009 12:56:47 +0900 Subject: x86: move 64-bit NUMA code Impact: Code movement, no functional change. Move the 64-bit NUMA code from setup_percpu.c to numa_64.c Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 217 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 71a14f89f89..08d140fbc31 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,6 +20,12 @@ #include #include +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...) +#endif + struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -33,6 +39,21 @@ int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; +DEFINE_PER_CPU(int, node_number) = 0; +EXPORT_PER_CPU_SYMBOL(node_number); + +/* + * Map cpu index to node index + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); + +/* + * Which logical CPUs are on which nodes + */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -640,3 +661,199 @@ void __init init_cpu_to_node(void) #endif +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); + + pr_debug("Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +void __cpuinit numa_set_node(int cpu, int node) +{ + int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); + + /* early setting, no percpu area yet */ + if (cpu_to_node_map) { + cpu_to_node_map[cpu] = node; + return; + } + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS + if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); + dump_stack(); + return; + } +#endif + per_cpu(x86_cpu_to_node_map, cpu) = node; + + if (node != NUMA_NO_NODE) + per_cpu(node_number, cpu) = node; +} + +void __cpuinit numa_clear_node(int cpu) +{ + numa_set_node(cpu, NUMA_NO_NODE); +} + +#ifndef CONFIG_DEBUG_PER_CPU_MAPS + +void __cpuinit numa_add_cpu(int cpu) +{ + cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); +} + +#else /* CONFIG_DEBUG_PER_CPU_MAPS */ + +/* + * --------- debug versions of the numa functions --------- + */ +static void __cpuinit numa_set_cpumask(int cpu, int enable) +{ + int node = early_cpu_to_node(cpu); + cpumask_t *mask; + char buf[64]; + + if (node_to_cpumask_map == NULL) { + printk(KERN_ERR "node_to_cpumask_map NULL\n"); + dump_stack(); + return; + } + + mask = &node_to_cpumask_map[node]; + if (enable) + cpu_set(cpu, *mask); + else + cpu_clear(cpu, *mask); + + cpulist_scnprintf(buf, sizeof(buf), mask); + printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); +} + +void __cpuinit numa_add_cpu(int cpu) +{ + numa_set_cpumask(cpu, 1); +} + +void __cpuinit numa_remove_cpu(int cpu) +{ + numa_set_cpumask(cpu, 0); +} + +int cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) { + printk(KERN_WARNING + "cpu_to_node(%d): usage too early!\n", cpu); + dump_stack(); + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} +EXPORT_SYMBOL(cpu_to_node); + +/* + * Same function as cpu_to_node() but used if called before the + * per_cpu areas are setup. + */ +int early_cpu_to_node(int cpu) +{ + if (early_per_cpu_ptr(x86_cpu_to_node_map)) + return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; + + if (!per_cpu_offset(cpu)) { + printk(KERN_WARNING + "early_cpu_to_node(%d): no per_cpu area!\n", cpu); + dump_stack(); + return NUMA_NO_NODE; + } + return per_cpu(x86_cpu_to_node_map, cpu); +} + + +/* empty cpumask */ +static const cpumask_t cpu_mask_none; + +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const cpumask_t *cpumask_of_node(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return (const cpumask_t *)&cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return &cpu_mask_none; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); + +/* + * Returns a bitmask of CPUs on Node 'node'. + * + * Side note: this function creates the returned cpumask on the stack + * so with a high NR_CPUS count, excessive stack space is used. The + * node_to_cpumask_ptr function should be used whenever possible. + */ +cpumask_t node_to_cpumask(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); + dump_stack(); + return cpu_online_map; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "node_to_cpumask(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_mask_none; + } + return node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(node_to_cpumask); + +/* + * --------- end of debug versions of the numa functions --------- + */ + +#endif /* CONFIG_DEBUG_PER_CPU_MAPS */ -- cgit v1.2.3 From dac5f4121df3c39fdb2ea57acd669a0ae19e46f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Jan 2009 15:42:24 +0100 Subject: x86, apic: untangle the send_IPI_*() jungle Our send_IPI_*() methods and definitions are a twisted mess: the same symbol is defined to different things depending on .config details, in a non-transparent way. - spread out the quirks into separately named per apic driver methods - prefix the standard PC methods with default_ - get rid of wrapper macro obfuscation - clean up various details Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 72a6d4ebe34..6348e114692 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -196,7 +196,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(to_cpumask(f->flush_cpumask), + apic->send_IPI_mask(to_cpumask(f->flush_cpumask), INVALIDATE_TLB_VECTOR_START + sender); while (!cpumask_empty(to_cpumask(f->flush_cpumask))) -- cgit v1.2.3 From d53e2f2855f1c7c2725d550c1ae6b26f4d671c50 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 Jan 2009 19:14:52 +0100 Subject: x86, smp: remove mach_ipi.h Move mach_ipi.h definitions into genapic.h. Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6348e114692..14c5af4d11e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include +#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From 3e5095d15276efd14a45393666b1bb7536bf179f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 27 Jan 2009 17:07:08 +0100 Subject: x86: replace CONFIG_X86_SMP with CONFIG_SMP The x86/Voyager subarch used to have this distinction between 'x86 SMP support' and 'Voyager SMP support': config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) This is a pointless distinction - Voyager can (and already does) use smp_ops to implement various SMP quirks it has - and it can be extended more to cover all the specialities of Voyager. So remove this complication in the Kconfig space. Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9f05157220f..2b938a38491 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,7 +1,7 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o -obj-$(CONFIG_X86_SMP) += tlb.o +obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o -- cgit v1.2.3 From 010060741ad35eacb504414bc6fb9bb575b15f62 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jan 2009 16:02:12 +0100 Subject: x86: add might_sleep() to do_page_fault() Impact: widen debug checks VirtualBox calls do_page_fault() from an atomic context but runs into a might_sleep() way pas this point, cure that. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8f4b859a04b..eb4d7fe0593 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -888,6 +888,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in which + * case we'll have missed the might_sleep() from down_read(). + */ + might_sleep(); } vma = find_vma(mm, address); -- cgit v1.2.3 From 8f47e16348e8e25eedf639092a8a2f10a66aba34 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 31 Jan 2009 02:03:42 +0100 Subject: x86: update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 56fe7124fbe..16582960056 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -4,7 +4,7 @@ * Based on code by Ingo Molnar and Andi Kleen, copyrighted * as follows: * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2003-2009 Red Hat Inc. * All Rights Reserved. * Copyright 2005 Andi Kleen, SUSE Labs. * Copyright 2007 Jiri Kosina, SUSE Labs. -- cgit v1.2.3 From 0973a06cde8cc1522fbcf2baacb926f1ee3f4c79 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 4 Feb 2009 15:24:09 -0800 Subject: x86: mm: introduce helper function in fault.c Impact: cleanup Introduce helper function fault_in_kernel_address() to make editors happy. Signed-off-by: Hiroshi Shimamoto Signed-off-by: H. Peter Anvin --- arch/x86/mm/fault.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eb4d7fe0593..8e9b0f1fd87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -775,6 +775,15 @@ static inline int access_error(unsigned long error_code, int write, return 0; } +static int fault_in_kernel_space(unsigned long address) +{ +#ifdef CONFIG_X86_32 + return address >= TASK_SIZE; +#else /* !CONFIG_X86_32 */ + return address >= TASK_SIZE64; +#endif /* CONFIG_X86_32 */ +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -817,11 +826,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * (error_code & 4) == 0, and that the fault was not a * protection error (error_code & 9) == 0. */ -#ifdef CONFIG_X86_32 - if (unlikely(address >= TASK_SIZE)) { -#else - if (unlikely(address >= TASK_SIZE64)) { -#endif + if (unlikely(fault_in_kernel_space(address))) { if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; -- cgit v1.2.3 From 44581a28e805a31661469c4b466b9cd14b36e7b6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 8 Feb 2009 09:58:40 -0500 Subject: x86: fix abuse of per_cpu_offset Impact: bug fix Don't use per_cpu_offset() to determine if it valid to access a per-cpu variable for a given cpu number. It is not a valid assumption on x86-64 anymore. Use cpu_possible() instead. Signed-off-by: Brian Gerst Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 08d140fbc31..deb1c1ab786 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -702,7 +702,7 @@ void __cpuinit numa_set_node(int cpu, int node) } #ifdef CONFIG_DEBUG_PER_CPU_MAPS - if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) { + if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); dump_stack(); return; @@ -790,7 +790,7 @@ int early_cpu_to_node(int cpu) if (early_per_cpu_ptr(x86_cpu_to_node_map)) return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; - if (!per_cpu_offset(cpu)) { + if (!cpu_possible(cpu)) { printk(KERN_WARNING "early_cpu_to_node(%d): no per_cpu area!\n", cpu); dump_stack(); -- cgit v1.2.3 From 7651194fb715b2d57658c05a710408f6b8448951 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 11 Feb 2009 22:26:52 +0530 Subject: x86: mm/init_32.c fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/x86/mm/init_32.c: In function ‘find_low_pfn_range’: arch/x86/mm/init_32.c:696: warning: format ‘%u’ expects type ‘unsigned int’, but Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef0507441..d48f2560364 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -692,7 +692,7 @@ void __init find_low_pfn_range(void) max_pfn = MAXMEM_PFN + highmem_pages; if (highmem_pages + MAXMEM_PFN > max_pfn) { printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %uMB.\n", + "available, ignoring highmem size of %luMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); highmem_pages = 0; -- cgit v1.2.3 From 3023533de43c5c01c660e1b48d3700b028eb4615 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:28:43 +0100 Subject: x86: fix warning in find_low_pfn_range() Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d48f2560364..e77459dd38a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -722,7 +722,7 @@ void __init find_low_pfn_range(void) highmem_pages = 0; #ifdef CONFIG_HIGHMEM if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%uMB) is " + printk(KERN_ERR "highmem size specified (%luMB) is " "bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); @@ -731,7 +731,7 @@ void __init find_low_pfn_range(void) if (highmem_pages) { if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %uMB results in " + printk(KERN_ERR "highmem size %luMB results in " "smaller than 64MB lowmem, ignoring it.\n" , pages_to_mb(highmem_pages)); highmem_pages = 0; -- cgit v1.2.3 From 4769843bc265a9c24584b98709cf39e1df5c1404 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:31:41 +0100 Subject: x86, 32-bit: clean up find_low_pfn_range() Impact: cleanup Split find_low_pfn_range() into two functions: - lowmem_pfn_init() - highmem_pfn_init() The former gets called if all of RAM fits into lowmem, otherwise we call highmem_pfn_init(). Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 137 +++++++++++++++++++++++++++++--------------------- 1 file changed, 79 insertions(+), 58 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index e77459dd38a..9d36eb9ebd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -675,75 +675,96 @@ static int __init parse_highmem(char *arg) } early_param("highmem", parse_highmem); +#define MSG_HIGHMEM_TOO_BIG \ + "highmem size (%luMB) is bigger than pages available (%luMB)!\n" + +#define MSG_LOWMEM_TOO_SMALL \ + "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n" /* - * Determine low and high memory ranges: + * All of RAM fits into lowmem - but if user wants highmem + * artificially via the highmem=x boot parameter then create + * it: */ -void __init find_low_pfn_range(void) +void __init lowmem_pfn_init(void) { - /* it could update max_pfn */ - - /* max_low_pfn is 0, we already have early_res support */ - - max_low_pfn = max_pfn; - if (max_low_pfn > MAXMEM_PFN) { - if (highmem_pages == -1) - highmem_pages = max_pfn - MAXMEM_PFN; - if (highmem_pages + MAXMEM_PFN < max_pfn) - max_pfn = MAXMEM_PFN + highmem_pages; - if (highmem_pages + MAXMEM_PFN > max_pfn) { - printk(KERN_WARNING "only %luMB highmem pages " - "available, ignoring highmem size of %luMB.\n", - pages_to_mb(max_pfn - MAXMEM_PFN), + if (highmem_pages == -1) + highmem_pages = 0; +#ifdef CONFIG_HIGHMEM + if (highmem_pages >= max_pfn) { + printk(KERN_ERR MSG_HIGHMEM_TOO_BIG, + pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); + highmem_pages = 0; + } + if (highmem_pages) { + if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) { + printk(KERN_ERR MSG_LOWMEM_TOO_SMALL, pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; + max_low_pfn -= highmem_pages; + } +#else + if (highmem_pages) + printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); +#endif +} + +#define MSG_HIGHMEM_TOO_SMALL \ + "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" + +#define MSG_HIGHMEM_TRIMMED \ + "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" +/* + * We have more RAM than fits into lowmem - we try to put it into + * highmem, also taking the highmem=x boot parameter into account: + */ +void __init highmem_pfn_init(void) +{ + if (highmem_pages == -1) + highmem_pages = max_pfn - MAXMEM_PFN; + + if (highmem_pages + MAXMEM_PFN < max_pfn) + max_pfn = MAXMEM_PFN + highmem_pages; + + if (highmem_pages + MAXMEM_PFN > max_pfn) { + printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL, + pages_to_mb(max_pfn - MAXMEM_PFN), + pages_to_mb(highmem_pages)); + highmem_pages = 0; + } + max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING - "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = MAXMEM_PFN; + /* Maximum memory usable is what is directly addressable */ + printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); + if (max_pfn > MAX_NONPAE_PFN) + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); + else + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ #ifndef CONFIG_HIGHMEM64G - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used." - "Use a HIGHMEM64G enabled kernel.\n"); - } + if (max_pfn > MAX_NONPAE_PFN) { + max_pfn = MAX_NONPAE_PFN; + printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); + } #endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ - } else { - if (highmem_pages == -1) - highmem_pages = 0; -#ifdef CONFIG_HIGHMEM - if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%luMB) is " - "bigger than pages available (%luMB)!.\n", - pages_to_mb(highmem_pages), - pages_to_mb(max_pfn)); - highmem_pages = 0; - } - if (highmem_pages) { - if (max_low_pfn - highmem_pages < - 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %luMB results in " - "smaller than 64MB lowmem, ignoring it.\n" - , pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn -= highmem_pages; - } -#else - if (highmem_pages) - printk(KERN_ERR "ignoring highmem size on non-highmem" - " kernel!\n"); -#endif - } +} + +/* + * Determine low and high memory ranges: + */ +void __init find_low_pfn_range(void) +{ + /* it could update max_pfn */ + + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + + if (max_low_pfn > MAXMEM_PFN) + highmem_pfn_init(); + else + lowmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3 From d88316c243e5458a1888edbe0353c4dec6e61c73 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 15:16:03 +0100 Subject: x86, 32-bit: refactor find_low_pfn_range() Impact: cleanup Make the max_low_pfn logic a bit more standard between lowmem_pfn_init() and highmem_pfn_init(). Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9d36eb9ebd5..1a9612499a3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -687,6 +687,9 @@ early_param("highmem", parse_highmem); */ void __init lowmem_pfn_init(void) { + /* max_low_pfn is 0, we already have early_res support */ + max_low_pfn = max_pfn; + if (highmem_pages == -1) highmem_pages = 0; #ifdef CONFIG_HIGHMEM @@ -720,6 +723,8 @@ void __init lowmem_pfn_init(void) */ void __init highmem_pfn_init(void) { + max_low_pfn = MAXMEM_PFN; + if (highmem_pages == -1) highmem_pages = max_pfn - MAXMEM_PFN; @@ -732,7 +737,6 @@ void __init highmem_pfn_init(void) pages_to_mb(highmem_pages)); highmem_pages = 0; } - max_low_pfn = MAXMEM_PFN; #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); @@ -758,13 +762,10 @@ void __init find_low_pfn_range(void) { /* it could update max_pfn */ - /* max_low_pfn is 0, we already have early_res support */ - max_low_pfn = max_pfn; - - if (max_low_pfn > MAXMEM_PFN) - highmem_pfn_init(); - else + if (max_pfn <= MAXMEM_PFN) lowmem_pfn_init(); + else + highmem_pfn_init(); } #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit v1.2.3 From 7b6aa335ca1a845c2262ec7a595b4521bca0f79d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 13:58:15 +0100 Subject: x86, apic: remove genapic.h Impact: cleanup Remove genapic.h and remove all references to it. Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 2 +- arch/x86/mm/tlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 15df1baee10..574c8bc95ef 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include int acpi_numa __initdata; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 14c5af4d11e..b641349fe07 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include +#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From e641f5f525acb163ba71d92de79c9c7366deae03 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 14:02:01 +0100 Subject: x86, apic: remove duplicate asm/apic.h inclusions Impact: cleanup Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index b641349fe07..a654d59e448 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,7 +14,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0, }; -#include /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.2.3 From 11124411aa95827404d6bfdfc14c908e1b54513c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:09 +0900 Subject: x86: convert to the new dynamic percpu allocator Impact: use new dynamic allocator, unified access to static/dynamic percpu memory Convert to the new dynamic percpu allocator. * implement populate_extra_pte() for both 32 and 64 * update setup_per_cpu_areas() to use pcpu_setup_static() * define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() * define config HAVE_DYNAMIC_PER_CPU_AREA Signed-off-by: Tejun Heo --- arch/x86/mm/init_32.c | 10 ++++++++++ arch/x86/mm/init_64.c | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00263bf07a8..8b1a0ef7f87 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,6 +137,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } +void __init populate_extra_pte(unsigned long vaddr) +{ + int pgd_idx = pgd_index(vaddr); + int pmd_idx = pmd_index(vaddr); + pmd_t *pmd; + + pmd = one_md_table_init(swapper_pg_dir + pgd_idx); + one_page_table_init(pmd + pmd_idx); +} + static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, unsigned long vaddr, pte_t *lastpte) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b49025..7f91e2cdc4c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -223,6 +223,25 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } +void __init populate_extra_pte(unsigned long vaddr) +{ + pgd_t *pgd; + pud_t *pud; + + pgd = pgd_offset_k(vaddr); + if (pgd_none(*pgd)) { + pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) { + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + return; + } + } + + set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); +} + /* * Create large page table mappings for a range of physical addresses. */ -- cgit v1.2.3 From 3c3e5694add02e665bbbd0fecfbbdcc0b903097a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 11:46:36 -0500 Subject: x86: check PMD in spurious_fault handler Impact: fix to prevent hard lockup on bad PMD permissions If the PMD does not have the correct permissions for a page access, but the PTE does, the spurious fault handler will mistake the fault as a lazy TLB transaction. This will result in an infinite loop of: fault -> spurious_fault check (pass) -> return to code -> fault This patch adds a check and a warn on if the PTE passes the permissions but the PMD does not. [ Updated: Ingo Molnar suggested using WARN_ONCE with some text ] Signed-off-by: Steven Rostedt --- arch/x86/mm/fault.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c76ef1d701c..278d645d108 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -455,6 +455,7 @@ static int spurious_fault(unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -482,7 +483,17 @@ static int spurious_fault(unsigned long address, if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; + + /* + * Make sure we have permissions in PMD + * If not, then there's a bug in the page tables. + */ + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* -- cgit v1.2.3 From 7a5714e0186030676d79a7b4b9830c8e45c3b0a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 17:44:21 +0100 Subject: x86, pat: add large-PAT check to split_large_page() Impact: future-proof the split_large_page() function Linus noticed that split_large_page() is not safe wrt. the PAT bit: it is bit 12 on the 1GB and 2MB page table level (_PAGE_BIT_PAT_LARGE), and it is bit 7 on the 4K page table level (_PAGE_BIT_PAT). Currently it is not a problem because we never set _PAGE_BIT_PAT_LARGE on any of the large-page mappings - but should this happen in the future the split_large_page() would silently lift bit 12 into the lowlevel 4K pte and would start corrupting the physical page frame offset. Not fun. So add a debug warning, to make sure if something ever sets the PAT bit then this function gets updated too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7be47d1a97e..8253bc97587 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { -- cgit v1.2.3 From 2d4a71676f4d89418a0d53e60b89e8b804b390b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 19:56:40 +0100 Subject: x86, mm: fault.c cleanup Impact: cleanup, no code changed Clean up various small details, which can be correctness checked automatically: - tidy up the include file section - eliminate unnecessary includes - introduce show_signal_msg() to clean up code flow - standardize the code flow - standardize comments and other style details - more cleanups, pointed out by checkpatch No code changed on either 32-bit nor 64-bit: arch/x86/mm/fault.o: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4632 32 24 4688 1250 fault.o.after the md5 changed due to a change in a single instruction: 2e8a8241e7f0d69706776a5a26c90bc0 fault.o.before.asm c5c3d36e725586eb74f0e10692f0193e fault.o.after.asm Because a __LINE__ reference in a WARN_ONCE() has changed. On 32-bit a few stack offsets changed - no code size difference nor any functionality difference. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 546 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 331 insertions(+), 215 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4b9fc5001c..351d679bf97 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,56 +1,59 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include /* For unblank_screen() */ +#include +#include #include #include -#include /* for max_low_pfn */ -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include + +#include -#include -#include -#include -#include -#include #include +#include +#include +#include #include -#include #include +#include /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) +enum x86_pf_error_code { + + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { @@ -82,23 +85,27 @@ static inline int notify_page_fault(struct pt_regs *regs) } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: + * + * 32-bit mode: + * + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * 64-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long error_code, - unsigned long addr) +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { + unsigned char *max_instr; unsigned char *instr; int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -114,9 +121,9 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return 0; while (scan_more && instr < max_instr) { - unsigned char opcode; unsigned char instr_hi; unsigned char instr_lo; + unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; @@ -173,15 +180,17 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) { siginfo_t info; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } @@ -189,6 +198,7 @@ static void force_sig_info_fault(int si_signo, int si_code, static int bad_address(void *p) { unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); } #endif @@ -200,13 +210,14 @@ static void dump_pagetable(unsigned long address) page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,14 +229,15 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } @@ -239,26 +251,38 @@ static void dump_pagetable(unsigned long address) pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk("PTE %lx", pte_val(*pte)); -ret: +out: printk("\n"); return; bad: @@ -285,7 +309,6 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) * and redundant with the set_pmd() on non-PAE. As would * set_pud. */ - pud = pud_offset(pgd, address); pud_k = pud_offset(pgd_k, address); if (!pud_present(*pud_k)) @@ -295,11 +318,14 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd_k = pmd_offset(pud_k, address); if (!pmd_present(*pmd_k)) return NULL; + if (!pmd_present(*pmd)) { set_pmd(pmd, *pmd_k); arch_flush_lazy_mmu_mode(); - } else + } else { BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + return pmd_k; } #endif @@ -312,29 +338,37 @@ KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; #endif -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -344,16 +378,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -363,8 +398,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. + * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -378,8 +414,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { #ifdef CONFIG_X86_32 if (!oops_may_print()) @@ -389,12 +426,14 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, #ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) + if (pte && pte_present(*pte) && !pte_exec(*pte)) { printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " "(uid: %d)\n", current_uid()); + } } #endif @@ -403,34 +442,45 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } #ifdef CONFIG_X86_64 -static noinline void pgtable_bad(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; - struct task_struct *tsk = current; + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } #endif -static noinline void no_context(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; @@ -440,18 +490,20 @@ static noinline void no_context(struct pt_regs *regs, int sig; #endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: * - * X86_64 - * Hall of shame of CPU/BIOS bugs. + * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; @@ -461,7 +513,7 @@ static noinline void no_context(struct pt_regs *regs, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. + * terminate things with extreme prejudice: */ #ifdef CONFIG_X86_32 bust_spinlocks(1); @@ -471,7 +523,7 @@ static noinline void no_context(struct pt_regs *regs, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); @@ -487,28 +539,54 @@ static noinline void no_context(struct pt_regs *regs, sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; + /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); #endif } -static void __bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here. + * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space. + * from user space: */ if (is_prefetch(regs, error_code, address)) return; @@ -516,22 +594,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, if (is_errata100(regs, address)) return; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; } @@ -541,15 +613,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, no_context(regs, error_code, address); } -static noinline void bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void __bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -562,67 +635,77 @@ static void __bad_area(struct pt_regs *regs, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void bad_area_access_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void out_of_memory(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). + * (which will retry the fault, or kill us if we got oom-killed): */ up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); } -static void do_sigbus(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die */ + /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); + #ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ + /* User space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; #endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void mm_fault_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address, unsigned int fault) +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) + if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -630,16 +713,19 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int spurious_fault(unsigned long error_code, - unsigned long address) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; @@ -678,20 +764,23 @@ static noinline int spurious_fault(unsigned long error_code, return 0; /* - * Make sure we have permissions in PMD - * If not, then there's a bug in the page tables. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ ret = spurious_fault_check(error_code, (pte_t *) pmd); WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); + return ret; } /* - * X86_32 - * Handle a fault on the vmalloc or module mapping area + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area * - * X86_64 - * Handle a fault on the vmalloc area + * 64-bit: + * + * Handle a fault on the vmalloc area * * This assumes no large pages in there. */ @@ -702,7 +791,7 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd_k; pte_t *pte_k; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; @@ -717,9 +806,11 @@ static noinline int vmalloc_fault(unsigned long address) pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); if (!pmd_k) return -1; + pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) return -1; + return 0; #else pgd_t *pgd, *pgd_ref; @@ -727,69 +818,84 @@ static noinline int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; - /* Make sure we are in vmalloc area */ + /* Make sure we are in vmalloc area: */ if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ pgd = pgd_offset(current->active_mm, address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - /* Below here mismatches are bugs because these lower tables - are shared */ + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ pud = pud_offset(pgd, address); pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); + pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); if (pmd_none(*pmd_ref)) return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); if (!pte_present(*pte_ref)) return -1; + pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); + return 0; #endif } int show_unhandled_signals = 1; -static inline int access_error(unsigned long error_code, int write, - struct vm_area_struct *vma) +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present */ + /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - } else if (unlikely(error_code & PF_PROT)) { - /* read, present */ - return 1; - } else { - /* read, not present */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) - return 1; + return 0; } + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + return 0; } @@ -797,9 +903,9 @@ static int fault_in_kernel_space(unsigned long address) { #ifdef CONFIG_X86_32 return address >= TASK_SIZE; -#else /* !CONFIG_X86_32 */ +#else return address >= TASK_SIZE64; -#endif /* CONFIG_X86_32 */ +#endif } /* @@ -812,18 +918,19 @@ asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - unsigned long address; + struct vm_area_struct *vma; struct task_struct *tsk; + unsigned long address; struct mm_struct *mm; - struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -847,22 +954,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ + /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); + return; } - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (unlikely(notify_page_fault(regs))) return; /* @@ -870,13 +978,15 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } #ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) @@ -884,8 +994,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) #endif /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -894,19 +1004,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -917,8 +1027,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in which - * case we'll have missed the might_sleep() from down_read(). + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): */ might_sleep(); } @@ -938,7 +1049,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -957,6 +1068,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -965,13 +1077,15 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else @@ -1004,13 +1118,13 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { + unsigned long flags; struct page *page; spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) + if (!vmalloc_sync_one(page_address(page), address)) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -1018,12 +1132,14 @@ void vmalloc_sync_all(void) #else /* CONFIG_X86_64 */ for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; + spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; -- cgit v1.2.3 From 107a03678cac0dd6cf7095f81442a4fa477e4700 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 20:37:05 +0100 Subject: x86, mm: fault.c, refactor/simplify the is_prefetch() code Impact: no functionality changed Factor out the opcode checker into a helper inline. The code got a tiny bit smaller: text data bss dec hex filename 4632 32 24 4688 1250 fault.o.before 4618 32 24 4674 1242 fault.o.after And it got cleaner / easier to review as well. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 100 ++++++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 351d679bf97..72973c7682b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -99,12 +99,58 @@ static inline int notify_page_fault(struct pt_regs *regs) * * Opcode checker based on code by Richard Brunner. */ +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) +{ + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + static int is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) { unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; /* @@ -114,68 +160,22 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { - unsigned char instr_hi; - unsigned char instr_lo; + while (instr < max_instr) { unsigned char opcode; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } } return prefetch; } -- cgit v1.2.3 From 8c938f9fae887f6e180bf802aa1c33cf74712aff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:12:18 +0100 Subject: x86, mm: fault.c, factor out the vm86 fault check Impact: cleanup Instead of an ugly, open-coded, #ifdef-ed vm86 related legacy check in do_page_fault(), put it into the check_v8086_mode() helper function and merge it with an existing #ifdef. Also, simplify the code flow a tiny bit in the helper. No code changed: arch/x86/mm/fault.o: text data bss dec hex filename 2711 12 12 2735 aaf fault.o.before 2711 12 12 2735 aaf fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 72973c7682b..7dc0615c3cf 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -328,14 +328,41 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -#endif -#ifdef CONFIG_X86_64 +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; +} + +#else /* CONFIG_X86_64: */ + static const char errata93_warning[] = KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" KERN_ERR "******* Please consider a BIOS update.\n" KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + #endif /* @@ -1091,16 +1118,8 @@ good_area: else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif + check_v8086_mode(regs, address, tsk); + up_read(&mm->mmap_sem); } -- cgit v1.2.3 From 121d5d0a7e5808fbcfda484efd7ba840ac93450f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:18:08 +0100 Subject: x86, mm: fault.c, enable PF_RSVD checks on 32-bit too Impact: improve page fault handling robustness The 'PF_RSVD' flag (bit 3) of the page-fault error_code is a relatively recent addition to x86 CPUs, so the 32-bit do_fault() implementation never had it. This flag gets set when the CPU detects nonzero values in any reserved bits of the page directory entries. Extend the existing 64-bit check for PF_RSVD in do_page_fault() to 32-bit too. If we detect such a fault then we print a more informative oops and the pagetables. This unifies the code some more, removes an ugly #ifdef and improves the 32-bit page fault code robustness a bit. It slightly increases the 32-bit kernel text size. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7dc0615c3cf..3e366146273 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -477,7 +477,6 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, dump_pagetable(address); } -#ifdef CONFIG_X86_64 static noinline void pgtable_bad(struct pt_regs *regs, unsigned long error_code, unsigned long address) @@ -503,7 +502,6 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, oops_end(flags, regs, sig); } -#endif static noinline void no_context(struct pt_regs *regs, unsigned long error_code, @@ -1015,10 +1013,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) local_irq_enable(); } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); -#endif /* * If we're in an interrupt, have no user context or are running -- cgit v1.2.3 From b814d41f0987c7648d7ed07471258101c95c026b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:32:10 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault() Impact: cleanup Remove an #ifdef from kmmio_fault() - we can do this by providing default implementations for is_kmmio_active() and kmmio_handler(). The compiler optimizes it all away in the !CONFIG_MMIOTRACE case. Also, while at it, clean up mmiotrace.h a bit: - standard header guards - standard vertical spaces for structure definitions No code changed (both with mmiotrace on and off in the config): text data bss dec hex filename 2947 12 12 2971 b9b fault.o.before 2947 12 12 2971 b9b fault.o.after Cc: Pekka Paalanen Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3e366146273..fe99af4b86d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -55,13 +55,14 @@ enum x86_pf_error_code { PF_INSTR = 1 << 4, }; +/* + * (returns 0 if mmiotrace is disabled) + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } -- cgit v1.2.3 From b18018126f422f5b706fd750373425e10e84b486 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:42:57 +0100 Subject: x86, mm, kprobes: fault.c, simplify notify_page_fault() Impact: cleanup Remove an #ifdef from notify_page_fault(). The function still compiles to nothing in the !CONFIG_KPROBES case. Introduce kprobes_built_in() and kprobe_fault_handler() helpers to allow this - they returns 0 if !CONFIG_KPROBES. No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fe99af4b86d..379beaec6ca 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -68,11 +68,10 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -80,9 +79,6 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; -#else - return 0; -#endif } /* -- cgit v1.2.3 From f2f13a8535174dbb813a0607a9d4737cfba98f6c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 22:50:24 +0100 Subject: x86, mm: fault.c, reorder functions Impact: cleanup Avoid a couple more #ifdefs by moving fundamentally non-unifiable functions into a single #ifdef 32-bit / #else / #endif block in fault.c: vmalloc*(), dump_pagetable(), check_vm8086_mode(). No code changed: text data bss dec hex filename 4618 32 24 4674 1242 fault.o.before 4618 32 24 4674 1242 fault.o.after Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 474 ++++++++++++++++++++++++++-------------------------- 1 file changed, 239 insertions(+), 235 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 379beaec6ca..4ce62fb80da 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -191,18 +191,124 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address, force_sig_info(si_signo, &info, tsk); } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - unsigned long dummy; + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; - return probe_kernel_address((unsigned long *)p, dummy); + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); @@ -239,7 +345,132 @@ static void dump_pagetable(unsigned long address) } printk("\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -284,83 +515,9 @@ out: return; bad: printk("BAD\n"); -#endif -} - -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } - - return pmd_k; -} - -/* - * Did it hit the DOS screen memory VA from vm86 mode? - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ - unsigned long bit; - - if (!v8086_mode(regs)) - return; - - bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; -} - -#else /* CONFIG_X86_64: */ - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* - * No vm86 mode in 64-bit mode: - */ -static inline void -check_v8086_mode(struct pt_regs *regs, unsigned long address, - struct task_struct *tsk) -{ } -#endif +#endif /* CONFIG_X86_64 */ /* * Workaround for K8 erratum #93 & buggy BIOS. @@ -795,109 +952,6 @@ spurious_fault(unsigned long error_code, unsigned long address) return ret; } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - * - * 64-bit: - * - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* - * Below here mismatches are bugs because these lower tables - * are shared: - */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - - pte = pte_offset_kernel(pmd, address); - - /* - * Don't use pte_page here, because the mappings can point - * outside mem_map, and the NUMA hash lookup cannot handle - * that: - */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - - return 0; -#endif -} - int show_unhandled_signals = 1; static inline int @@ -1115,53 +1169,3 @@ good_area: up_read(&mm->mmap_sem); } - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif -} -- cgit v1.2.3 From 8f7661496cece8320137d5e26808825498fd2b26 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:00:29 +0100 Subject: x86, mm: fault.c, unify oops printing Impact: refine/extend page fault related oops printing on 64-bit - honor the pause_on_oops logic on 64-bit too - print out NX fault warnings on 64-bit as well - factor out the NX fault message to make it git-greppable and readable Note that this means that we do the PF_INSTR check on 32-bit non-PAE as well where it should not occur ... normally. Cannot hurt. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4ce62fb80da..ebfaca3bbb1 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -595,28 +595,24 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; pte_t *pte = lookup_address(address, &level); - if (pte && pte_present(*pte) && !pte_exec(*pte)) { - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); - } + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) -- cgit v1.2.3 From 1cc99544dde9e48602979f16b9309fade6e93051 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:07:48 +0100 Subject: x86, mm: fault.c, unify oops handling Impact: add oops-recursion check to 32-bit Unify the oops state-machine, to the 64-bit version. It is slightly more careful in that it does a recursion check in oops_begin(), and is thus more likely to show the relevant oops. It also means that 32-bit will print one more line at the end of pagefault triggered oopses: printk(KERN_EMERG "CR2: %016lx\n", address); Which is generally good information to be seen in partial-dump digital-camera jpegs ;-) The downside is the somewhat more complex critical path. Both variants have been tested well meanwhile by kernel developers crashing their boxes so i dont think this is a practical worry. This removes 3 ugly #ifdefs from no_context() and makes the function a lot nicer read. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ebfaca3bbb1..8fe2dd254df 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -659,11 +659,8 @@ no_context(struct pt_regs *regs, unsigned long error_code, { struct task_struct *tsk = current; unsigned long *stackend; - -#ifdef CONFIG_X86_64 unsigned long flags; int sig; -#endif /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) @@ -690,11 +687,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice: */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else flags = oops_begin(); -#endif show_fault_oops(regs, error_code, address); @@ -702,15 +695,10 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; @@ -719,7 +707,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, printk(KERN_EMERG "CR2: %016lx\n", address); oops_end(flags, regs, sig); -#endif } /* -- cgit v1.2.3 From c3731c68668325abddee8665018c74c7156a57be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:22:34 +0100 Subject: x86, mm: fault.c, remove #ifdef from do_page_fault() Impact: cleanup do_page_fault() has this ugly #ifdef in its prototype: #ifdef CONFIG_X86_64 asmlinkage #endif void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) Replace it with 'dotraplinkage' which maps to exactly the above construct: nothing on 32-bit and asmlinkage on 64-bit. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8fe2dd254df..9c2dc5d7953 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -972,10 +972,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; -- cgit v1.2.3 From d951734654f76a370a89b4e531af9b765bd13541 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:32:28 +0100 Subject: x86, mm: rename TASK_SIZE64 => TASK_SIZE_MAX Impact: cleanup Rename TASK_SIZE64 to TASK_SIZE_MAX, and provide the define on 32-bit too. (mapped to TASK_SIZE) This allows 32-bit code to make use of the (former-) TASK_SIZE64 symbol as well, in a clean way. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9c2dc5d7953..6fa9f175cba 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -963,7 +963,7 @@ static int fault_in_kernel_space(unsigned long address) #ifdef CONFIG_X86_32 return address >= TASK_SIZE; #else - return address >= TASK_SIZE64; + return address >= TASK_SIZE_MAX; #endif } -- cgit v1.2.3 From 7c178a26d3e753d2a4346d3e4b8aa549d387f698 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:27:29 +0100 Subject: x86, mm: fault.c, remove #ifdef from fault_in_kernel_space() Impact: cleanup Removal of an #ifdef in fault_in_kernel_space(), by making use of the new TASK_SIZE_MAX symbol which is now available on 32-bit too. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6fa9f175cba..f195691ec26 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -960,11 +960,7 @@ access_error(unsigned long error_code, int write, struct vm_area_struct *vma) static int fault_in_kernel_space(unsigned long address) { -#ifdef CONFIG_X86_32 - return address >= TASK_SIZE; -#else return address >= TASK_SIZE_MAX; -#endif } /* -- cgit v1.2.3 From cd1b68f08f6328737928e5b8ba8ef80394680ff0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:39:02 +0100 Subject: x86, mm: fault.c, give another attempt at prefetch handing before SIGBUS Impact: extend prefetch handling on 64-bit Currently there's an extra is_prefetch() check done in do_sigbus(), which we only do on 32 bits. This is a last-ditch check before we terminate a task, so it's worth giving prefetch instructions another chance - should none of our existing quirks have caught a prefetch instruction related spurious fault. The only risk is if a prefetch causes a real sigbus, in that case we'll not OOM but try another fault. But this code has been on 32-bit for a long time, so it should be fine in practice. So do this on 64-bit too - and thus remove one more #ifdef. Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f195691ec26..413e835e4a8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,11 +836,9 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) if (!(error_code & PF_USER)) no_context(regs, error_code, address); -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault: */ + /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; -#endif tsk->thread.cr2 = address; tsk->thread.error_code = error_code; -- cgit v1.2.3 From f8eeb2e6be367d79be3617f0a12646bced8b2fe6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Feb 2009 23:13:36 +0100 Subject: x86, mm: fault.c, update copyrights Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 413e835e4a8..9bb07d331c3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,6 +1,7 @@ /* * Copyright (C) 1995 Linus Torvalds * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ #include #include -- cgit v1.2.3 From 2366c298b5afe52e635afd5604b69ce9fd4471fc Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Sun, 22 Feb 2009 01:01:13 +0100 Subject: x86: numa_32.c: fix sparse warning: Using plain integer as NULL pointer Fix this sparse warning: arch/x86/mm/numa_32.c:197:24: warning: Using plain integer as NULL pointer Signed-off-by: Hannes Eder Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173..3957cd6d645 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; + return NULL; node_remap_alloc_vaddr[nid] += size; memset(allocation, 0, size); -- cgit v1.2.3 From b319eed0aa0a6d710887350a3cb734c572aa64c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Feb 2009 10:24:18 +0100 Subject: x86, mm: fault.c, simplify kmmio_fault(), cleanup Clarify the kmmio_fault() comment. Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9bb07d331c3..a03b7279efa 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -57,7 +57,8 @@ enum x86_pf_error_code { }; /* - * (returns 0 if mmiotrace is disabled) + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -- cgit v1.2.3 From 458a3e644c3327be529393982e24277eda8f1ac7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Feb 2009 11:57:21 +0900 Subject: x86: update populate_extra_pte() and add populate_extra_pmd() Impact: minor change to populate_extra_pte() and addition of pmd flavor Update populate_extra_pte() to return pointer to the pte_t for the specified address and add populate_extra_pmd() which only populates till the pmd and returns pointer to the pmd entry for the address. For 64bit, pud/pmd/pte fill functions are separated out from set_pte_vaddr[_pud]() and used for set_pte_vaddr[_pud]() and populate_extra_{pte|pmd}(). Signed-off-by: Tejun Heo --- arch/x86/mm/init_32.c | 13 ++++++--- arch/x86/mm/init_64.c | 75 ++++++++++++++++++++++++++++++--------------------- 2 files changed, 55 insertions(+), 33 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8b1a0ef7f87..84a26883ab4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -137,14 +137,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) return pte_offset_kernel(pmd, 0); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { int pgd_idx = pgd_index(vaddr); int pmd_idx = pmd_index(vaddr); + + return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx; +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + int pte_idx = pte_index(vaddr); pmd_t *pmd; - pmd = one_md_table_init(swapper_pg_dir + pgd_idx); - one_page_table_init(pmd + pmd_idx); + pmd = populate_extra_pmd(vaddr); + return one_page_table_init(pmd) + pte_idx; } static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7f91e2cdc4c..7d4e76da336 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,34 +168,51 @@ static __ref void *spp_getpage(void) return ptr; } -void -set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, pud); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} - pud = pud_page + pud_index(vaddr); +static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +{ if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); - if (pmd != pmd_offset(pud, 0)) { + if (pmd != pmd_offset(pud, 0)) printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", - pmd, pmd_offset(pud, 0)); - return; - } + pmd, pmd_offset(pud, 0)); } - pmd = pmd_offset(pud, vaddr); + return pmd_offset(pud, vaddr); +} + +static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +{ if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); + pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); - if (pte != pte_offset_kernel(pmd, 0)) { + if (pte != pte_offset_kernel(pmd, 0)) printk(KERN_ERR "PAGETABLE BUG #02!\n"); - return; - } } + return pte_offset_kernel(pmd, vaddr); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pud = pud_page + pud_index(vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); set_pte(pte, new_pte); /* @@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } -void -set_pte_vaddr(unsigned long vaddr, pte_t pteval) +void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; pud_t *pud_page; @@ -223,23 +239,22 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval) set_pte_vaddr_pud(pud_page, vaddr, pteval); } -void __init populate_extra_pte(unsigned long vaddr) +pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; pud_t *pud; pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) { - printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); - return; - } - } + pud = fill_pud(pgd, vaddr); + return fill_pmd(pud, vaddr); +} + +pte_t * __init populate_extra_pte(unsigned long vaddr) +{ + pmd_t *pmd; - set_pte_vaddr_pud((pud_t *)pgd_page_vaddr(*pgd), vaddr, __pte(0)); + pmd = populate_extra_pmd(vaddr); + return fill_pte(pmd, vaddr); } /* -- cgit v1.2.3 From 40823f737e5bd186a1156fb1c28f360260e1e084 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:26:18 +0100 Subject: x86: memtest: reuse test patterns when memtest parameter exceeds number of available patterns Impact: fix unexpected behaviour when pattern number is out of range Current implementation provides 4 patterns for memtest. The code doesn't check whether the memtest parameter value exceeds the maximum pattern number. Instead the memtest code pretends to test with non-existing patterns, e.g. when booting with memtest=10 I've observed the following ... early_memtest: pattern num 10 0000001000 - 0000006000 pattern 0 ... 0000001000 - 0000006000 pattern 1 ... 0000001000 - 0000006000 pattern 2 ... 0000001000 - 0000006000 pattern 3 ... 0000001000 - 0000006000 pattern 4 ... 0000001000 - 0000006000 pattern 5 ... 0000001000 - 0000006000 pattern 6 ... 0000001000 - 0000006000 pattern 7 ... 0000001000 - 0000006000 pattern 8 ... 0000001000 - 0000006000 pattern 9 ... But in fact Linux didn't test anything for patterns > 4 as the default case in memtest() is to leave the function. I suggest to use the memtest parameter as the number of tests to be performed and to re-iterate over all existing patterns. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 9cab18b0b85..00b8bdc64c3 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,6 +9,8 @@ #include +#define _MAX_MEM_PATTERNS 4 + static void __init memtest(unsigned long start_phys, unsigned long size, unsigned pattern) { @@ -21,6 +23,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size, unsigned long count; unsigned long incr; + pattern = pattern % _MAX_MEM_PATTERNS; + switch (pattern) { case 0: val = 0UL; @@ -110,8 +114,9 @@ void __init early_memtest(unsigned long start, unsigned long end) t_size = end - t_start; printk(KERN_CONT "\n %010llx - %010llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, pattern); + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, + pattern % _MAX_MEM_PATTERNS); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 6d74171bf7315257d276aa35400c5a8d6a993f19 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:27:27 +0100 Subject: x86: memtest: introduce array to select memtest patterns Impact: code cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 65 ++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 00b8bdc64c3..827f94044cf 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -9,48 +9,25 @@ #include -#define _MAX_MEM_PATTERNS 4 +static u64 patterns[] __initdata = { + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, +}; static void __init memtest(unsigned long start_phys, unsigned long size, - unsigned pattern) + u64 pattern) { unsigned long i; - unsigned long *start; + u64 *start; unsigned long start_bad; unsigned long last_bad; - unsigned long val; unsigned long start_phys_aligned; unsigned long count; unsigned long incr; - pattern = pattern % _MAX_MEM_PATTERNS; - - switch (pattern) { - case 0: - val = 0UL; - break; - case 1: - val = -1UL; - break; - case 2: -#ifdef CONFIG_X86_64 - val = 0x5555555555555555UL; -#else - val = 0x55555555UL; -#endif - break; - case 3: -#ifdef CONFIG_X86_64 - val = 0xaaaaaaaaaaaaaaaaUL; -#else - val = 0xaaaaaaaaUL; -#endif - break; - default: - return; - } - - incr = sizeof(unsigned long); + incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); count = (size - (start_phys_aligned - start_phys))/incr; start = __va(start_phys_aligned); @@ -58,15 +35,16 @@ static void __init memtest(unsigned long start_phys, unsigned long size, last_bad = 0; for (i = 0; i < count; i++) - start[i] = val; + start[i] = pattern; for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != val) { + if (*start != pattern) { if (start_phys_aligned == last_bad + incr) { last_bad += incr; } else { if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", - val, start_bad, last_bad + incr); + printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", + (unsigned long long) pattern, + start_bad, last_bad + incr); reserve_early(start_bad, last_bad + incr, "BAD RAM"); } start_bad = last_bad = start_phys_aligned; @@ -74,8 +52,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size, } } if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", - val, start_bad, last_bad + incr); + printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", + (unsigned long long) pattern, start_bad, + last_bad + incr); reserve_early(start_bad, last_bad + incr, "BAD RAM"); } } @@ -95,13 +74,16 @@ early_param("memtest", parse_memtest); void __init early_memtest(unsigned long start, unsigned long end) { u64 t_start, t_size; - unsigned pattern; + unsigned int i; + u64 pattern; if (!memtest_pattern) return; printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); - for (pattern = 0; pattern < memtest_pattern; pattern++) { + for (i = 0; i < memtest_pattern; i++) { + unsigned int idx = i % ARRAY_SIZE(patterns); + pattern = patterns[idx]; t_start = start; t_size = 0; while (t_start < end) { @@ -115,8 +97,7 @@ void __init early_memtest(unsigned long start, unsigned long end) printk(KERN_CONT "\n %010llx - %010llx pattern %d", (unsigned long long)t_start, - (unsigned long long)t_start + t_size, - pattern % _MAX_MEM_PATTERNS); + (unsigned long long)t_start + t_size, idx); memtest(t_start, t_size, pattern); -- cgit v1.2.3 From 7dad169e57eda1f0aa6dc5ac43a898b4b0ced2c7 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:28:07 +0100 Subject: x86: memtest: cleanup memtest function Impact: code cleanup Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 827f94044cf..01a72d6825b 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -16,6 +16,15 @@ static u64 patterns[] __initdata = { 0xaaaaaaaaaaaaaaaaULL, }; +static void __init reserve_bad_mem(u64 pattern, unsigned long start_bad, + unsigned long end_bad) +{ + printk(KERN_CONT "\n %016llx bad mem addr " + "%010lx - %010lx reserved", + (unsigned long long) pattern, start_bad, end_bad); + reserve_early(start_bad, end_bad, "BAD RAM"); +} + static void __init memtest(unsigned long start_phys, unsigned long size, u64 pattern) { @@ -37,26 +46,18 @@ static void __init memtest(unsigned long start_phys, unsigned long size, for (i = 0; i < count; i++) start[i] = pattern; for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != pattern) { - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - } else { - if (start_bad) { - printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", - (unsigned long long) pattern, - start_bad, last_bad + incr); - reserve_early(start_bad, last_bad + incr, "BAD RAM"); - } - start_bad = last_bad = start_phys_aligned; - } + if (*start == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; } - if (start_bad) { - printk(KERN_CONT "\n %016llx bad mem addr %010lx - %010lx reserved", - (unsigned long long) pattern, start_bad, - last_bad + incr); - reserve_early(start_bad, last_bad + incr, "BAD RAM"); - } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); } /* default is disabled */ -- cgit v1.2.3 From 570c9e69aaa84689fb8ed2a3a4af39ca54ba7a47 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:28:58 +0100 Subject: x86: memtest: adapt log messages - print test pattern instead of pattern number, - show pattern as stored in memory, - use proper priority flags, - consistent use of u64 throughout the code Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 01a72d6825b..3232397783a 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -16,25 +16,22 @@ static u64 patterns[] __initdata = { 0xaaaaaaaaaaaaaaaaULL, }; -static void __init reserve_bad_mem(u64 pattern, unsigned long start_bad, - unsigned long end_bad) +static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) { - printk(KERN_CONT "\n %016llx bad mem addr " - "%010lx - %010lx reserved", - (unsigned long long) pattern, start_bad, end_bad); + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); reserve_early(start_bad, end_bad, "BAD RAM"); } -static void __init memtest(unsigned long start_phys, unsigned long size, - u64 pattern) +static void __init memtest(u64 pattern, u64 start_phys, u64 size) { - unsigned long i; + u64 i, count; u64 *start; - unsigned long start_bad; - unsigned long last_bad; - unsigned long start_phys_aligned; - unsigned long count; - unsigned long incr; + u64 start_bad, last_bad; + u64 start_phys_aligned; + size_t incr; incr = sizeof(pattern); start_phys_aligned = ALIGN(start_phys, incr); @@ -81,7 +78,7 @@ void __init early_memtest(unsigned long start, unsigned long end) if (!memtest_pattern) return; - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); for (i = 0; i < memtest_pattern; i++) { unsigned int idx = i % ARRAY_SIZE(patterns); pattern = patterns[idx]; @@ -96,14 +93,13 @@ void __init early_memtest(unsigned long start, unsigned long end) if (t_start + t_size > end) t_size = end - t_start; - printk(KERN_CONT "\n %010llx - %010llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, idx); - - memtest(t_start, t_size, pattern); + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) t_start, + (unsigned long long) t_start + t_size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, t_start, t_size); t_start += t_size; } } - printk(KERN_CONT "\n"); } -- cgit v1.2.3 From bfb4dc0da45f8fddc76eba7e62919420c7d6dae2 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:30:04 +0100 Subject: x86: memtest: wipe out test pattern from memory Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 56 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 3232397783a..9c52ef19e6f 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -57,6 +57,29 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) reserve_bad_mem(pattern, start_bad, last_bad + incr); } +static void __init do_one_pass(u64 pattern, u64 start, u64 end) +{ + u64 size = 0; + + while (start < end) { + start = find_e820_area_size(start, &size, 1); + + /* done ? */ + if (start >= end) + break; + if (start + size > end) + size = end - start; + + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long) start, + (unsigned long long) start + size, + (unsigned long long) cpu_to_be64(pattern)); + memtest(pattern, start, size); + + start += size; + } +} + /* default is disabled */ static int memtest_pattern __initdata; @@ -71,35 +94,22 @@ early_param("memtest", parse_memtest); void __init early_memtest(unsigned long start, unsigned long end) { - u64 t_start, t_size; unsigned int i; - u64 pattern; + unsigned int idx = 0; if (!memtest_pattern) return; printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); for (i = 0; i < memtest_pattern; i++) { - unsigned int idx = i % ARRAY_SIZE(patterns); - pattern = patterns[idx]; - t_start = start; - t_size = 0; - while (t_start < end) { - t_start = find_e820_area_size(t_start, &t_size, 1); - - /* done ? */ - if (t_start >= end) - break; - if (t_start + t_size > end) - t_size = end - t_start; - - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long) t_start, - (unsigned long long) t_start + t_size, - (unsigned long long) cpu_to_be64(pattern)); - memtest(pattern, t_start, t_size); - - t_start += t_size; - } + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } + + if (idx > 0) { + printk(KERN_INFO "early_memtest: wipe out " + "test pattern from memory\n"); + /* additional test with pattern 0 will do this */ + do_one_pass(0, start, end); } } -- cgit v1.2.3 From 63823126c221dd721ce7351b596b3b73aa943613 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 25 Feb 2009 11:31:49 +0100 Subject: x86: memtest: add additional (regular) test patterns Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 9c52ef19e6f..0bcd7883d03 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -14,6 +14,19 @@ static u64 patterns[] __initdata = { 0xffffffffffffffffULL, 0x5555555555555555ULL, 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ }; static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) -- cgit v1.2.3 From 7880f7464546842ee14179bef16a6e14381ea638 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:13 -0800 Subject: gpu/drm, x86, PAT: routine to keep identity map in sync Add a function to check and keep identity maps in sync, when changing any memory type. One of the follow on patches will also use this routine. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index aebbf67a79d..399383c6f61 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -624,6 +624,33 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) +{ + unsigned long id_sz; + + if (!pat_enabled || base >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < base + size) ? + __pa(high_memory) - base : + size; + + if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { + printk(KERN_INFO + "%s:%d ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + base, (unsigned long long)(base + size)); + return -EINVAL; + } + return 0; +} + /* * Internal interface to reserve a range of physical memory with prot. * Reserved non RAM regions only and after successful reserve_memtype, @@ -633,7 +660,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; - int id_sz, ret; + int ret; unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); @@ -670,23 +697,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags); } - /* Need to keep identity mapping in sync */ - if (paddr >= __pa(high_memory)) - return 0; - - id_sz = (__pa(high_memory) < paddr + size) ? - __pa(high_memory) - paddr : - size; - - if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, flags) < 0) { free_memtype(paddr, paddr + size); - printk(KERN_ERR - "%s:%d reserve_pfn_range ioremap_change_attr failed %s " - "for %Lx-%Lx\n", - current->comm, current->pid, - cattr_name(flags), - (unsigned long long)paddr, - (unsigned long long)(paddr + size)); return -EINVAL; } return 0; -- cgit v1.2.3 From 17581ad812a9abb0182260374ef2e52d4a808a64 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 24 Feb 2009 17:35:14 -0800 Subject: gpu/drm, x86, PAT: PAT support for io_mapping_* Make io_mapping_create_wc and io_mapping_free go through PAT to make sure that there are no memory type aliases. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Cc: Dave Airlie Cc: Jesse Barnes Cc: Eric Anholt Cc: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 6c2b1af1692..94596f794b1 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -int +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -int +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,6 +38,44 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif +int +reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) +{ + unsigned long ret_flag; + + if (!is_io_mapping_possible(base, size)) + goto out_err; + + if (!pat_enabled) { + *prot = pgprot_noncached(PAGE_KERNEL); + return 0; + } + + if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) + goto out_err; + + if (ret_flag == _PAGE_CACHE_WB) + goto out_free; + + if (kernel_map_sync_memtype(base, size, ret_flag)) + goto out_free; + + *prot = __pgprot(__PAGE_KERNEL | ret_flag); + return 0; + +out_free: + free_memtype(base, base + size); +out_err: + return -EINVAL; +} + +void +free_io_memtype(u64 base, unsigned long size) +{ + if (pat_enabled) + free_memtype(base, base + size); +} + /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * -- cgit v1.2.3 From 13093cb0e59053bf97910de3a24f07cdff71c62c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 03:16:47 +0100 Subject: gpu/drm, x86, PAT: PAT support for io_mapping_*, export symbols for modules Impact: build fix ERROR: "reserve_io_memtype_wc" [drivers/gpu/drm/i915/i915.ko] undefined! ERROR: "free_io_memtype" [drivers/gpu/drm/i915/i915.ko] undefined! Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 94596f794b1..d5e28424622 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -68,6 +68,7 @@ out_free: out_err: return -EINVAL; } +EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); void free_io_memtype(u64 base, unsigned long size) @@ -75,6 +76,7 @@ free_io_memtype(u64 base, unsigned long size) if (pat_enabled) free_memtype(base, base + size); } +EXPORT_SYMBOL_GPL(free_io_memtype); /* Map 'pfn' using fixed map 'type' and protections 'prot' */ -- cgit v1.2.3 From fd862dde18c3e360f510780e1d1bf615866b11c2 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Sun, 15 Feb 2009 21:48:54 -0300 Subject: x86, fixmap: define reserve_top_address for x86_64 Impact: new interface (not yet use) Define reserve_top_address for x86_64; only for later x86 integration. Signed-off-by: Gustavo F. Padovan Acked-by: Glauber Costa Signed-off-by: H. Peter Anvin --- arch/x86/mm/pgtable.c | 18 ++++++++++++++++++ arch/x86/mm/pgtable_32.c | 16 ---------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 86f2ffc43c3..5b7c7c8464f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -313,6 +313,24 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, return young; } +/** + * reserve_top_address - reserves a hole in the top of kernel address space + * @reserve - size of hole to reserve + * + * Can be used to relocate the fixmap area and poke a hole in the top + * of kernel address space to make room for a hypervisor. + */ +void __init reserve_top_address(unsigned long reserve) +{ +#ifdef CONFIG_X86_32 + BUG_ON(fixmaps_set > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); + __FIXADDR_TOP = -reserve - PAGE_SIZE; + __VMALLOC_RESERVE += reserve; +#endif +} + int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0951db9ee51..c3cf6e11576 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -97,22 +97,6 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) unsigned long __FIXADDR_TOP = 0xfffff000; EXPORT_SYMBOL(__FIXADDR_TOP); -/** - * reserve_top_address - reserves a hole in the top of kernel address space - * @reserve - size of hole to reserve - * - * Can be used to relocate the fixmap area and poke a hole in the top - * of kernel address space to make room for a hypervisor. - */ -void __init reserve_top_address(unsigned long reserve) -{ - BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; - __VMALLOC_RESERVE += reserve; -} - /* * vmalloc=size forces the vmalloc area to be exactly 'size' * bytes. This can be used to increase (or decrease) the -- cgit v1.2.3 From f5c1aa1537be39d8b9bb5279b5881d81898fd3cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 1 Mar 2009 12:32:08 +0100 Subject: Revert "gpu/drm, x86, PAT: PAT support for io_mapping_*" This reverts commit 17581ad812a9abb0182260374ef2e52d4a808a64. Sitsofe Wheeler reported that /dev/dri/card0 is MIA on his EeePC 900 and bisected it to this commit. Graphics card is an i915 in an EeePC 900: 00:02.0 VGA compatible controller [0300]: Intel Corporation Mobile 915GM/GMS/910GML Express Graphics Controller [8086:2592] (rev 04) ( Most likely the ioremap() of the driver failed and hence the card did not initialize. ) Reported-by: Sitsofe Wheeler Bisected-by: Sitsofe Wheeler Cc: Venkatesh Pallipadi Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 44 ++------------------------------------------ 1 file changed, 2 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index d5e28424622..6c2b1af1692 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,13 +21,13 @@ #include #ifdef CONFIG_X86_PAE -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { return 1; } #else -static int +int is_io_mapping_possible(resource_size_t base, unsigned long size) { /* There is no way to map greater than 1 << 32 address without PAE */ @@ -38,46 +38,6 @@ is_io_mapping_possible(resource_size_t base, unsigned long size) } #endif -int -reserve_io_memtype_wc(u64 base, unsigned long size, pgprot_t *prot) -{ - unsigned long ret_flag; - - if (!is_io_mapping_possible(base, size)) - goto out_err; - - if (!pat_enabled) { - *prot = pgprot_noncached(PAGE_KERNEL); - return 0; - } - - if (reserve_memtype(base, base + size, _PAGE_CACHE_WC, &ret_flag)) - goto out_err; - - if (ret_flag == _PAGE_CACHE_WB) - goto out_free; - - if (kernel_map_sync_memtype(base, size, ret_flag)) - goto out_free; - - *prot = __pgprot(__PAGE_KERNEL | ret_flag); - return 0; - -out_free: - free_memtype(base, base + size); -out_err: - return -EINVAL; -} -EXPORT_SYMBOL_GPL(reserve_io_memtype_wc); - -void -free_io_memtype(u64 base, unsigned long size) -{ - if (pat_enabled) - free_memtype(base, base + size); -} -EXPORT_SYMBOL_GPL(free_io_memtype); - /* Map 'pfn' using fixed map 'type' and protections 'prot' */ void * -- cgit v1.2.3 From 2b688dfd0a93cf3b17c38feef693361da47b0606 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 12:55:04 +0200 Subject: x86: move __VMALLOC_RESERVE to pgtable_32.c Impact: cleanup The __VMALLOC_RESERVE global variable is not used in init_32.c. Move that to pgtable_32.c to reduce the diff between init_32.c and init_64.c. Signed-off-by: Pekka Enberg LKML-Reference: <1236077704.2675.4.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 -- arch/x86/mm/pgtable_32.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 06708ee94aa..5b06a2f5dea 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,8 +50,6 @@ #include #include -unsigned int __VMALLOC_RESERVE = 128 << 20; - unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0951db9ee51..e4032c886c3 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -20,6 +20,8 @@ #include #include +unsigned int __VMALLOC_RESERVE = 128 << 20; + /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. -- cgit v1.2.3 From fd578f9c0a0a7bf3e460e6f21cdc6f4018949e80 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 12:55:05 +0200 Subject: x86: use roundup() instead of PAGE_ALIGN() in find_early_table_space() Impact: cleanup This patch changes find_early_table_space() to use roundup() for rounding up tables to page size to unify the common parts of the 32-bit and 64-bit implementations. Signed-off-by: Pekka Enberg LKML-Reference: <1236077705.2675.6.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5b06a2f5dea..1dd6b6334dc 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -845,10 +845,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = PAGE_ALIGN(puds * sizeof(pud_t)); + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { unsigned long extra; @@ -859,10 +859,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += PAGE_ALIGN(ptes * sizeof(pte_t)); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* for fixmap */ - tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t)); + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could -- cgit v1.2.3 From 05f209e7b936a48e341d36831079116a06658ccc Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:02 +0200 Subject: x86: add sanity checks to init_32.c Impact: unification This patch adds sanity checks that are already in init_64.c to init_32.c. Signed-off-by: Pekka Enberg LKML-Reference: <1236078902.2675.16.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1dd6b6334dc..1570a822c18 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1214,18 +1214,21 @@ void mark_rodata_ro(void) void free_init_pages(char *what, unsigned long begin, unsigned long end) { -#ifdef CONFIG_DEBUG_PAGEALLOC + unsigned long addr = begin; + + if (addr >= end) + return; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ +#ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", begin, PAGE_ALIGN(end)); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else - unsigned long addr; - /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that @@ -1233,14 +1236,16 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - for (addr = begin; addr < end; addr += PAGE_SIZE) { + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); #endif } -- cgit v1.2.3 From e087edd8c056292191bb989baf49f83ee509e624 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:04 +0200 Subject: x86: make sure initmem is writable on 64-bit Impact: unification This patch ports commit 3c1df68b848b39270752ff8d4b956cc4a4dce0f6 ("x86: make sure initmem is writable") to the 64-bit version to unify implementations of free_init_pages(). Signed-off-by: Pekka Enberg Cc: Arjan van de Ven LKML-Reference: <1236078904.2675.17.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b49025..03da9030d0e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -962,6 +962,13 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) begin, PAGE_ALIGN(end)); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (; addr < end; addr += PAGE_SIZE) { -- cgit v1.2.3 From e5b2bb552706ca0e30795ee84caacbb37cec5705 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 13:15:06 +0200 Subject: x86: unify free_init_pages() and free_initmem() Impact: unification This patch introduces a common arch/x86/mm/init.c and moves the identical free_init_pages() and free_initmem() functions to the file. Signed-off-by: Pekka Enberg LKML-Reference: <1236078906.2675.18.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/init.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 44 -------------------------------------------- arch/x86/mm/init_64.c | 44 -------------------------------------------- 4 files changed, 50 insertions(+), 89 deletions(-) create mode 100644 arch/x86/mm/init.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 2b938a38491..08537747cb5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,4 +1,4 @@ -obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ +obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o gup.o obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c new file mode 100644 index 00000000000..ce6a722587d --- /dev/null +++ b/arch/x86/mm/init.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include + +void free_init_pages(char *what, unsigned long begin, unsigned long end) +{ + unsigned long addr = begin; + + if (addr >= end) + return; + + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + + for (; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); + free_page(addr); + totalram_pages++; + } +#endif +} + +void free_initmem(void) +{ + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1570a822c18..cd8d6732613 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1212,50 +1212,6 @@ void mark_rodata_ro(void) } #endif -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr = begin; - - if (addr >= end) - return; - - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. - */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 03da9030d0e..aae87456d93 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -945,50 +945,6 @@ void __init mem_init(void) initsize >> 10); } -void free_init_pages(char *what, unsigned long begin, unsigned long end) -{ - unsigned long addr = begin; - - if (addr >= end) - return; - - /* - * If debugging page accesses then do not free this memory but - * mark them not present - any buggy init-section access will - * create a kernel page fault: - */ -#ifdef CONFIG_DEBUG_PAGEALLOC - printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); - set_memory_np(begin, (end - begin) >> PAGE_SHIFT); -#else - /* - * We just marked the kernel text read only above, now that - * we are going to free part of that, we need to make that - * writeable first. - */ - set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); - - printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); - - for (; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } -#endif -} - -void free_initmem(void) -{ - free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); -} - #ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -- cgit v1.2.3 From 867c5b5292583b1e474cbbcb4c77f09bfca3903c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 3 Mar 2009 14:10:12 +0200 Subject: x86: set_highmem_pages_init() cleanup Impact: cleanup This patch moves set_highmem_pages_init() to arch/x86/mm/highmem_32.c. The declaration of the function is kept in asm/numa_32.h because asm/highmem.h is included only if CONFIG_HIGHMEM is enabled so we can't put the empty static inline function there. Signed-off-by: Pekka Enberg LKML-Reference: <1236082212.2675.24.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 12 ------------ arch/x86/mm/numa_32.c | 26 -------------------------- 3 files changed, 34 insertions(+), 38 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index bcc079c282d..13a823cf564 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,5 +1,6 @@ #include #include +#include /* for totalram_pages */ void *kmap(struct page *page) { @@ -156,3 +157,36 @@ EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); + +#ifdef CONFIG_NUMA +void __init set_highmem_pages_init(void) +{ + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } + totalram_pages += totalhigh_pages; +} +#else +static void __init set_highmem_pages_init(void) +{ + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); + + totalram_pages += totalhigh_pages; +} +#endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cd8d6732613..0b087dcd2c1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -467,22 +467,10 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, work_with_active_regions(nid, add_highpages_work_fn, &data); } -#ifndef CONFIG_NUMA -static void __init set_highmem_pages_init(void) -{ - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - totalram_pages += totalhigh_pages; -} -#endif /* !CONFIG_NUMA */ - #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { } -static inline void set_highmem_pages_init(void) -{ -} #endif /* CONFIG_HIGHMEM */ void __init native_pagetable_setup_start(pgd_t *base) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173..a04092a8acc 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -423,32 +423,6 @@ void __init initmem_init(unsigned long start_pfn, setup_bootmem_allocator(); } -void __init set_highmem_pages_init(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int nid; - - for_each_zone(zone) { - unsigned long zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - nid = zone_to_nid(zone); - printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, nid, zone_start_pfn, zone_end_pfn); - - add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn); - } - totalram_pages += totalhigh_pages; -#endif -} - #ifdef CONFIG_MEMORY_HOTPLUG static int paddr_to_nid(u64 addr) { -- cgit v1.2.3 From 03787ceed8f7bf06af29f3b213017d89f8e9423d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 3 Mar 2009 15:32:24 +0100 Subject: x86: set_highmem_pages_init() cleanup, fix !CONFIG_NUMA && CONFIG_HIGHMEM=y Impact: build fix arch/x86/mm/highmem_32.c:187: error: static declaration of 'set_highmem_pages_init' follows non-static declaration arch/x86/include/asm/numa_32.h:8: error: previous declaration of 'set_highmem_pages_init' was here Signed-off-by: Pekka Enberg LKML-Reference: <1236082212.2675.24.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 13a823cf564..00f127c80b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -183,7 +183,7 @@ void __init set_highmem_pages_init(void) totalram_pages += totalhigh_pages; } #else -static void __init set_highmem_pages_init(void) +void __init set_highmem_pages_init(void) { add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); -- cgit v1.2.3 From f254f3909efaf59ca2d0f408de2d044dace60706 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 3 Mar 2009 12:02:57 -0800 Subject: x86: un-__init fill_pud/pmd/pte They are used by __set_fixmap->set_pte_vaddr_pud, which can be used by arch_setup_additional_pages(), and so is used after init. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 11981fc8570..07f44d491df 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -168,7 +168,7 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { pud_t *pud = (pud_t *)spp_getpage(); @@ -180,7 +180,7 @@ static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr) return pud_offset(pgd, vaddr); } -static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) { if (pud_none(*pud)) { pmd_t *pmd = (pmd_t *) spp_getpage(); @@ -192,7 +192,7 @@ static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr) return pmd_offset(pud, vaddr); } -static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr) +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) { if (pmd_none(*pmd)) { pte_t *pte = (pte_t *) spp_getpage(); -- cgit v1.2.3 From 540aca06b737cc38965b52eeceefba3d24376461 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 4 Mar 2009 11:46:40 +0200 Subject: x86: move devmem_is_allowed() to common mm/init.c Impact: cleanup The function is identical on 32-bit and 64-bit configurations so move it to the common mm/init.c file. Signed-off-by: Pekka Enberg LKML-Reference: <1236160001.29024.29.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 24 ++++++++++++++++++++++++ arch/x86/mm/init_32.c | 21 --------------------- arch/x86/mm/init_64.c | 22 ---------------------- 3 files changed, 24 insertions(+), 43 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ce6a722587d..f89df52683c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,9 +1,33 @@ +#include #include + #include #include +#include #include #include +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr = begin; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0b087dcd2c1..917c4e60c76 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -354,27 +354,6 @@ repeat: } } -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - pte_t *kmap_pte; pgprot_t kmap_prot; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 724e537432e..074435e7982 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -876,28 +876,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * devmem_is_allowed() checks to see if /dev/mem access to a certain address - * is valid. The argument is a physical page number. - * - * - * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. - * Access has to be given to non-kernel-ram areas as well, these contain the PCI - * mmio resources as well as potential bios/acpi data regions. - */ -int devmem_is_allowed(unsigned long pagenr) -{ - if (pagenr <= 256) - return 1; - if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) - return 0; - if (!page_is_ram(pagenr)) - return 1; - return 0; -} - - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; -- cgit v1.2.3 From 6298e719cf388f43b674f43799af467d3e4e5aa7 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 4 Mar 2009 10:16:07 +0200 Subject: x86: set_highmem_pages_init() cleanup, #2 Impact: cleanup The zones are set up at this stage so there's a highmem zone available even for the UMA case. The only difference there is that for machines that have CONFIG_HIGHMEM enabled but don't have any highmem available, ->zone_start_pfn is zero whereas highstart_pfn is non-zero). The field is left zeroed because of the !size test in free_area_init_core() but shouldn't be a problem because add_highpages_with_active_regions() handles empty ranges just fine. Signed-off-by: Pekka Enberg Cc: Mel Gorman LKML-Reference: <1236154567.29024.23.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0..d11745334a6 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); -#ifdef CONFIG_NUMA void __init set_highmem_pages_init(void) { struct zone *zone; @@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void) } totalram_pages += totalhigh_pages; } -#else -void __init set_highmem_pages_init(void) -{ - add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); - - totalram_pages += totalhigh_pages; -} -#endif /* CONFIG_NUMA */ -- cgit v1.2.3 From a71edd1f46c8a599509bda478fb4eea27fb0da63 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:22:35 -0800 Subject: x86: fix bootmem cross node for 32bit numa Impact: fix panic on system 2g x4 sockets Found one system with 4 sockets and every sockets has 2g can not boot with numa32 because boot mem is crossing nodes. So try to have numa version of setup_bootmem_allocator(). Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE485B.8000902@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 46 ++++++++++++++++++++++++++++++++++++++++------ arch/x86/mm/numa_32.c | 5 +++-- 2 files changed, 43 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 917c4e60c76..67bdb59d4e1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -776,9 +776,37 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +#ifdef CONFIG_NEED_MULTIPLE_NODES +static unsigned long __init setup_node_bootmem(int nodeid, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long bootmap) +{ + unsigned long bootmap_size; + + if (start_pfn > max_low_pfn) + return bootmap; + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + /* don't touch min_low_pfn */ + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), + bootmap >> PAGE_SHIFT, + start_pfn, end_pfn); + printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", + nodeid, start_pfn<> PAGE_SHIFT, - min_low_pfn, max_low_pfn); printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<> PAGE_SHIFT, + min_low_pfn, max_low_pfn); printk(KERN_INFO " bootmap %08lx - %08lx\n", bootmap, bootmap + bootmap_size); - for_each_online_node(i) - free_bootmem_with_active_regions(i, max_low_pfn); + free_bootmem_with_active_regions(0, max_low_pfn); early_res_to_bootmem(0, max_low_pfn<bdata = &bootmem_node_data[nid]; + } - NODE_DATA(0)->bdata = &bootmem_node_data[0]; setup_bootmem_allocator(); } -- cgit v1.2.3 From b68adb16f29c8ea02f21f5ebf65bcabffe217e9f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 01:24:04 -0800 Subject: x86: make 32-bit init_memory_mapping range change more like 64-bit Impact: cleanup make code more readable and more like 64-bit Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AE48B4.8010907@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 126 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 94 insertions(+), 32 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 67bdb59d4e1..37aeaf366d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -885,29 +885,55 @@ static void __init find_early_table_space(unsigned long end, int use_pse) (table_start << PAGE_SHIFT) + tables); } +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#define NR_RANGE_MR 3 + +static int save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<> PAGE_SHIFT; - end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); - } else { - /* head is not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + /* head could not be big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (end_pfn > (end>>PAGE_SHIFT)) + end_pfn = end>>PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; } - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); /* big page range */ - start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < (big_page_start >> PAGE_SHIFT)) - start_pfn = big_page_start >> PAGE_SHIFT; end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - use_pse); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1< (big_page_start>>PAGE_SHIFT)) { - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) - kernel_physical_mapping_init(pgd_base, start_pfn, - end_pfn, 0); + start_pfn = pos>>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + if (start_pfn < end_pfn) + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; } + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %08lx - %08lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<> PAGE_SHIFT, + mr[i].end >> PAGE_SHIFT, + mr[i].page_size_mask == (1< Date: Wed, 4 Mar 2009 11:13:40 +0200 Subject: x86: move free_initrd_mem() to common mm/init.c Impact: cleanup The function is identical on 32-bit and 64-bit configurations so move it to the common mm/init.c file. Signed-off-by: Pekka Enberg LKML-Reference: <1236158020.29024.28.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 7 +++++++ arch/x86/mm/init_32.c | 7 ------- arch/x86/mm/init_64.c | 7 ------- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f89df52683c..cc7fe660f33 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -71,3 +71,10 @@ void free_initmem(void) (unsigned long)(&__init_begin), (unsigned long)(&__init_end)); } + +#ifdef CONFIG_BLK_DEV_INITRD +void free_initrd_mem(unsigned long start, unsigned long end) +{ + free_init_pages("initrd memory", start, end); +} +#endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 37aeaf366d5..c69c6b1f5e5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1275,13 +1275,6 @@ void mark_rodata_ro(void) } #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 074435e7982..d325186dd32 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -963,13 +963,6 @@ void mark_rodata_ro(void) #endif -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_init_pages("initrd memory", start, end); -} -#endif - int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { -- cgit v1.2.3 From fc5efe3941c47c0278fe1bbcf8cc02a03a74fcda Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Mar 2009 12:21:24 -0800 Subject: x86: fix bootmem cross node for 32bit numa, cleanup Impact: clean up Simplify the code, reuse some lines. Remove min_low_pfn reference, it is always 0 Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49AEE2C4.2030602@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c69c6b1f5e5..c351456d06d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -776,7 +776,6 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } -#ifdef CONFIG_NEED_MULTIPLE_NODES static unsigned long __init setup_node_bootmem(int nodeid, unsigned long start_pfn, unsigned long end_pfn, @@ -802,7 +801,6 @@ static unsigned long __init setup_node_bootmem(int nodeid, return bootmap + bootmap_size; } -#endif void __init setup_bootmem_allocator(void) { @@ -812,8 +810,7 @@ void __init setup_bootmem_allocator(void) * Initialize the boot-time allocator (with low memory only): */ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<> PAGE_SHIFT, - min_low_pfn, max_low_pfn); - printk(KERN_INFO " bootmap %08lx - %08lx\n", - bootmap, bootmap + bootmap_size); - free_bootmem_with_active_regions(0, max_low_pfn); - early_res_to_bootmem(0, max_low_pfn< Date: Thu, 5 Mar 2009 14:54:52 +0200 Subject: x86: init_memory_mapping() trivial cleanups Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), fix up all trivial issues. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-1-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 42 +++++++++++++++++++++++++----------------- arch/x86/mm/init_64.c | 26 +++++++++++++++----------- 2 files changed, 40 insertions(+), 28 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c351456d06d..ad4e03c2d4d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -868,11 +868,10 @@ static void __init find_early_table_space(unsigned long end, int use_pse) table_start >>= PAGE_SHIFT; table_end = table_start; - table_top = table_start + (tables>>PAGE_SHIFT); + table_top = table_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, - (table_start << PAGE_SHIFT) + tables); + end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); } struct map_range { @@ -899,8 +898,13 @@ static int save_mr(struct map_range *mr, int nr_range, return nr_range; } +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ unsigned long __init_refok init_memory_mapping(unsigned long start, - unsigned long end) + unsigned long end) { pgd_t *pgd_base = swapper_pg_dir; unsigned long page_size_mask = 0; @@ -911,7 +915,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, int nr_range, i; int use_pse; - printk(KERN_INFO "init_memory_mapping: %08lx-%08lx\n", start, end); + printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); #ifdef CONFIG_DEBUG_PAGEALLOC /* @@ -940,19 +944,19 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __supported_pte_mask |= _PAGE_GLOBAL; } - memset(mr, 0, sizeof(mr)); - nr_range = 0; - if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; + memset(mr, 0, sizeof(mr)); + nr_range = 0; + /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. */ - /* head could not be big page alignment ? */ + /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; pos = start_pfn << PAGE_SHIFT; if (pos == 0) @@ -960,14 +964,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, else end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (end_pfn > (end>>PAGE_SHIFT)) - end_pfn = end>>PAGE_SHIFT; + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pos = end_pfn << PAGE_SHIFT; } - /* big page range */ + /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); @@ -977,7 +981,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pos = end_pfn << PAGE_SHIFT; } - /* tail is not big page alignment ? */ + /* tail is not big page (2M) alignment */ start_pfn = pos>>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; if (start_pfn < end_pfn) @@ -998,13 +1002,17 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %08lx - %08lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + if (use_gbpages) { unsigned long extra; + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; } else pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { unsigned long extra; + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* @@ -647,7 +652,6 @@ static int save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, unsigned long page_size_mask) { - if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) panic("run out of range for init_memory_mapping\n"); @@ -679,13 +683,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); - /* - * Find space for the kernel direct mapping tables. - * - * Later we should allocate these tables in the local node of the - * memory mapped. Unfortunately this is done currently before the - * nodes are discovered. - */ if (!after_bootmem) init_gbpages(); @@ -709,7 +706,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = 0; - /* head if not big page alignment ?*/ + /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; pos = start_pfn << PAGE_SHIFT; end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) @@ -721,7 +718,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pos = end_pfn << PAGE_SHIFT; } - /* big page (2M) range*/ + /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) @@ -769,7 +766,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* move it */ old_start = mr[i].start; memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof (struct map_range)); + (nr_range - 1 - i) * sizeof(struct map_range)); mr[i--].start = old_start; nr_range--; } @@ -780,6 +777,13 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Thu, 5 Mar 2009 14:54:53 +0200 Subject: x86: add gbpages support to 32-bit init_memory_mapping() Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), add gbpages support to the 32-bit version. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-2-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ad4e03c2d4d..5fad0f95d5a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -65,6 +65,8 @@ static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; +int direct_gbpages; + static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; @@ -831,14 +833,22 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -static void __init find_early_table_space(unsigned long end, int use_pse) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (use_pse) { @@ -913,7 +923,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, struct map_range mr[NR_RANGE_MR]; int nr_range, i; - int use_pse; + int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); @@ -923,9 +933,10 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - use_pse = 0; + use_pse = use_gbpages = 0; #else use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; #endif #ifdef CONFIG_X86_PAE @@ -944,6 +955,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __supported_pte_mask |= _PAGE_GLOBAL; } + if (use_gbpages) + page_size_mask |= 1 << PG_LEVEL_1G; if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; @@ -1015,7 +1028,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_init_bootmem) - find_early_table_space(end, use_pse); + find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) kernel_physical_mapping_init(pgd_base, -- cgit v1.2.3 From 49a2bf7303b0dc5fccbb3ff7cf2e7751f0e3953d Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:54 +0200 Subject: x86: find_early_table_space() unification Impact: cleanup There are some minor differences between the 32-bit and 64-bit find_early_table_space() functions. This patch wraps those differences under CONFIG_X86_32 to make the function identical on both configurations. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-3-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 9 +++++++++ arch/x86/mm/init_64.c | 14 ++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5fad0f95d5a..86a99947455 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -855,24 +855,33 @@ static void __init find_early_table_space(unsigned long end, int use_pse, unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 extra += PMD_SIZE; +#endif ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); +#ifdef CONFIG_X86_32 /* for fixmap */ tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables * need roughly 0.5KB per GB. */ +#ifdef CONFIG_X86_32 start = 0x7000; table_start = find_e820_area(start, max_pfn_mapped<>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + /* * RED-PEN putting page tables only on node 0 could * cause a hotspot and fill up ZONE_DMA. The page tables * need roughly 0.5KB per GB. */ +#ifdef CONFIG_X86_32 + start = 0x7000; + table_start = find_e820_area(start, max_pfn_mapped< Date: Thu, 5 Mar 2009 14:54:55 +0200 Subject: x86: move pgd_base out of init_memory_mapping() Impact: cleanup This patch moves pgd_base out of init_memory_mapping() to reduce the diff between the 32-bit version and the 64-bit version of the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-4-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 86a99947455..cfc68d60138 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -227,11 +227,11 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base, - unsigned long start_pfn, +static void __init kernel_physical_mapping_init(unsigned long start_pfn, unsigned long end_pfn, int use_pse) { + pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; @@ -509,8 +509,9 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) +static void __init early_ioremap_page_table_range_init(void) { + pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; /* @@ -925,7 +926,6 @@ static int save_mr(struct map_range *mr, int nr_range, unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - pgd_t *pgd_base = swapper_pg_dir; unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long pos; @@ -1040,12 +1040,12 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init(pgd_base, + kernel_physical_mapping_init( mr[i].start >> PAGE_SHIFT, mr[i].end >> PAGE_SHIFT, mr[i].page_size_mask == (1< Date: Thu, 5 Mar 2009 14:54:56 +0200 Subject: x86: ifdef 32-bit specific setup in init_memory_mapping() Impact: cleanup Enabling NX, PSE, and PGE are only required on 32-bit so ifdef them in both versions of the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-5-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cfc68d60138..eb98cb90cb3 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -948,6 +948,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif +#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) @@ -963,6 +964,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } +#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 151e5ba3441..c3c0be5b637 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -712,6 +712,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_PAE + set_nx(); + if (nx_enabled) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#endif + + /* Enable PSE if available */ + if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); + + /* Enable PGE if available */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); + __supported_pte_mask |= _PAGE_GLOBAL; + } +#endif + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; if (use_pse) -- cgit v1.2.3 From 96083ca11bc85265c7ef9e791a57e3514d8f605a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:57 +0200 Subject: x86: remove unnecessary save_mr() sanity check Impact: cleanup The save_mr() function already checks that start_pfn is less than end_pfn so we can remove the unnecessary check which reduces the diff between the 32-bit and the 64-bit versions of init_memory_mapping(). Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-6-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index eb98cb90cb3..559715b488b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1008,8 +1008,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* tail is not big page (2M) alignment */ start_pfn = pos>>PAGE_SHIFT; end_pfn = end>>PAGE_SHIFT; - if (start_pfn < end_pfn) - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { -- cgit v1.2.3 From c464573cb3d3bdd45eed8f5f59596f84ede95a0c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:58 +0200 Subject: x86: rename after_init_bootmem to after_bootmem in mm/init_32.c Impact: cleanup This patch renames after_init_bootmem to after_bootmem in mm/init_32.c to reduce the diff to the 64-bit version of of init_memory_mapping(). Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-7-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 559715b488b..cc5c3992385 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -63,7 +63,7 @@ static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; -static int __initdata after_init_bootmem; +int after_bootmem; int direct_gbpages; @@ -92,7 +92,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { - if (after_init_bootmem) + if (after_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); @@ -119,7 +119,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = NULL; - if (after_init_bootmem) { + if (after_bootmem) { #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif @@ -158,7 +158,7 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, pte_t *newpte; int i; - BUG_ON(after_init_bootmem); + BUG_ON(after_bootmem); newpte = alloc_low_page(); for (i = 0; i < PTRS_PER_PTE; i++) set_pte(newpte + i, pte[i]); @@ -831,7 +831,7 @@ void __init setup_bootmem_allocator(void) bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap); #endif - after_init_bootmem = 1; + after_bootmem = 1; } static void __init find_early_table_space(unsigned long end, int use_pse, @@ -1037,7 +1037,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * memory mapped. Unfortunately this is done currently before the * nodes are discovered. */ - if (!after_init_bootmem) + if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) @@ -1052,11 +1052,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); - if (!after_init_bootmem) + if (!after_bootmem) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); - if (!after_init_bootmem) + if (!after_bootmem) early_memtest(start, end); return end >> PAGE_SHIFT; -- cgit v1.2.3 From cbba65796df99f3ca9bf70d14e5a19384c54b6a1 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:54:59 +0200 Subject: x86: unify kernel_physical_mapping_init() call in init_memory_mapping() Impact: cleanup The 64-bit version of init_memory_mapping() uses the last mapped address returned from kernel_physical_mapping_init() whereas the 32-bit version doesn't. This patch adds relevant ifdefs to both versions of the function to reduce the diff between them. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-8-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 10 +++++++++- arch/x86/mm/init_64.c | 21 +++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cc5c3992385..00c1d850825 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -929,6 +929,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long pos; + unsigned long ret; struct map_range mr[NR_RANGE_MR]; int nr_range, i; @@ -1040,11 +1041,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); +#ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) kernel_physical_mapping_init( mr[i].start >> PAGE_SHIFT, mr[i].end >> PAGE_SHIFT, mr[i].page_size_mask == (1<> PAGE_SHIFT; + return ret >> PAGE_SHIFT; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c3c0be5b637..e4fadea2e52 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -686,10 +686,10 @@ static int save_mr(struct map_range *mr, int nr_range, unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) { - unsigned long last_map_addr = 0; unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; unsigned long pos; + unsigned long ret; struct map_range mr[NR_RANGE_MR]; int nr_range, i; @@ -819,10 +819,18 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) find_early_table_space(end, use_pse, use_gbpages); +#ifdef CONFIG_X86_32 + for (i = 0; i < nr_range; i++) + kernel_physical_mapping_init( + mr[i].start >> PAGE_SHIFT, + mr[i].end >> PAGE_SHIFT, + mr[i].page_size_mask == (1<> PAGE_SHIFT; + return ret >> PAGE_SHIFT; } #ifndef CONFIG_NUMA -- cgit v1.2.3 From d58e854e36ddf241ebc243e4122c5ab087bf38df Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:00 +0200 Subject: x86: add table start and end sanity checks to 32-bit init_memory_mapping() Impact: cleanup This patch adds a sanity check to the 32-bit version of init_memory_mapping() to reduce the diff to the 64-bit version. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-9-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 00c1d850825..0a3707fb973 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1060,7 +1060,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); - if (!after_bootmem) + if (!after_bootmem && table_end > table_start) reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); -- cgit v1.2.3 From 01ced9ec14ad1b4f8a533c2f2b5a4fe4c92c1099 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:01 +0200 Subject: x86: ifdef 32-bit and 64-bit setup in init_memory_mapping() Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), ifdef configuration specific setup code in the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-10-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/init_64.c | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0a3707fb973..3f91bdc2097 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1054,10 +1054,16 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, mr[i].page_size_mask); #endif +#ifdef CONFIG_X86_32 early_ioremap_page_table_range_init(); load_cr3(swapper_pg_dir); +#endif +#ifdef CONFIG_X86_64 + if (!after_bootmem) + mmu_cr4_features = read_cr4(); +#endif __flush_tlb_all(); if (!after_bootmem && table_end > table_start) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e4fadea2e52..5ecb23a57d2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -832,8 +832,16 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, mr[i].page_size_mask); #endif +#ifdef CONFIG_X86_32 + early_ioremap_page_table_range_init(); + + load_cr3(swapper_pg_dir); +#endif + +#ifdef CONFIG_X86_64 if (!after_bootmem) mmu_cr4_features = read_cr4(); +#endif __flush_tlb_all(); if (!after_bootmem && table_end > table_start) -- cgit v1.2.3 From c338d6f60fc29dfc74bd82b91526ef43ba992bab Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:02 +0200 Subject: x86: ifdef 32-bit and 64-bit pfn setup in init_memory_mapping() Impact: cleanup To reduce the diff between the 32-bit and 64-bit versions of init_memory_mapping(), ifdef configuration specific pfn setup code in the function. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-11-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 42 +++++++++++++++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 21 +++++++++++++++++++++ 2 files changed, 60 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3f91bdc2097..34760e48397 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -975,20 +975,25 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, memset(mr, 0, sizeof(mr)); nr_range = 0; + /* head if not big page alignment ? */ + start_pfn = start >> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. */ - /* head if not big page alignment ? */ - start_pfn = start >> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; if (pos == 0) end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); else end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { @@ -999,12 +1004,43 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5ecb23a57d2..d99bc6ac488 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -741,8 +741,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* head if not big page alignment ? */ start_pfn = start >> PAGE_SHIFT; pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#endif if (end_pfn > (end >> PAGE_SHIFT)) end_pfn = end >> PAGE_SHIFT; if (start_pfn < end_pfn) { @@ -753,16 +767,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, /* big page (2M) range */ start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); @@ -783,6 +803,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, page_size_mask & (1<>PAGE_SHIFT; -- cgit v1.2.3 From b47e3418c52b26f6143fc696326ae52a21324551 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:03 +0200 Subject: x86: ifdef 32-bit and 64-bit NR_RANGE_MR for save_mr() unification Impact: cleanup As a trivial preparation for moving common code to arc/x86/mm/init.c, ifdef the 32-bit and 64-bit versions of NR_RANGE_MR. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-12-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/init_64.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 34760e48397..f59e9b85163 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -900,7 +900,11 @@ struct map_range { unsigned page_size_mask; }; +#ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif static int save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d99bc6ac488..d101990e463 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -660,7 +660,11 @@ struct map_range { unsigned page_size_mask; }; +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ #define NR_RANGE_MR 5 +#endif static int save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, -- cgit v1.2.3 From 0c0f756fd679d9747d52dad51fce3a5bb362eec3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:04 +0200 Subject: x86: add stub init_gbpages() for 32-bit init_memory_mapping() Impact: cleanup This patch adds an empty static inline init_gbpages() for the 32-bit version of init_memory_mapping() making both versions identical. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-13-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f59e9b85163..cd3c24b490a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -922,6 +922,10 @@ static int save_mr(struct map_range *mr, int nr_range, return nr_range; } +static inline void init_gbpages(void) +{ +} + /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from @@ -941,6 +945,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + if (!after_bootmem) + init_gbpages(); + #ifdef CONFIG_DEBUG_PAGEALLOC /* * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. -- cgit v1.2.3 From f765090a2617b8d9cb73b71e0aa850c29460d8be Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:05 +0200 Subject: x86: move init_memory_mapping() to common mm/init.c Impact: cleanup This patch moves the init_memory_mapping() function to common mm/init.c. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-14-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 328 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/init_32.c | 308 ++--------------------------------------------- arch/x86/mm/init_64.c | 314 ++--------------------------------------------- 3 files changed, 342 insertions(+), 608 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cc7fe660f33..3a21b136da2 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -2,10 +2,338 @@ #include #include +#include #include #include #include #include +#include + +#ifdef CONFIG_X86_32 +extern void __init early_ioremap_page_table_range_init(void); +extern void __init kernel_physical_mapping_init(unsigned long start_pfn, + unsigned long end_pfn, + int use_pse); +#endif + +#ifdef CONFIG_X86_64 +extern unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask); +#endif + +unsigned long __initdata table_start; +unsigned long __meminitdata table_end; +unsigned long __meminitdata table_top; + +int after_bootmem; + +int direct_gbpages +#ifdef CONFIG_DIRECT_GBPAGES + = 1 +#endif +; + +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) +{ + unsigned long puds, pmds, ptes, tables, start; + + puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + + if (use_gbpages) { + unsigned long extra; + + extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); + pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else + pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; + + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + + if (use_pse) { + unsigned long extra; + + extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); +#ifdef CONFIG_X86_32 + extra += PMD_SIZE; +#endif + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else + ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); + +#ifdef CONFIG_X86_32 + /* for fixmap */ + tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); +#endif + + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ +#ifdef CONFIG_X86_32 + start = 0x7000; + table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; + table_end = table_start; + table_top = table_start + (tables >> PAGE_SHIFT); + + printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); +} + +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +#ifdef CONFIG_X86_32 +#define NR_RANGE_MR 3 +#else /* CONFIG_X86_64 */ +#define NR_RANGE_MR 5 +#endif + +static int save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) +{ + if (start_pfn < end_pfn) { + if (nr_range >= NR_RANGE_MR) + panic("run out of range for init_memory_mapping\n"); + mr[nr_range].start = start_pfn<> PAGE_SHIFT; + pos = start_pfn << PAGE_SHIFT; +#ifdef CONFIG_X86_32 + /* + * Don't use a large page for the first 2/4MB of memory + * because there are often fixed size MTRRs in there + * and overlapping MTRRs into large pages can cause + * slowdowns. + */ + if (pos == 0) + end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); + else + end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#endif + if (end_pfn > (end >> PAGE_SHIFT)) + end_pfn = end >> PAGE_SHIFT; + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + pos = end_pfn << PAGE_SHIFT; + } + + /* big page (2M) range */ + start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); +#ifdef CONFIG_X86_32 + end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); +#else /* CONFIG_X86_64 */ + end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) + end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); +#endif + + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PUD_SHIFT) + << (PUD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & + ((1<>PMD_SHIFT) + << (PMD_SHIFT - PAGE_SHIFT); + end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); + if (start_pfn < end_pfn) { + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, + page_size_mask & (1<>PAGE_SHIFT; + end_pfn = end>>PAGE_SHIFT; + nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof(struct map_range)); + mr[i--].start = old_start; + nr_range--; + } + + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG " %010lx - %010lx page %s\n", + mr[i].start, mr[i].end, + (mr[i].page_size_mask & (1<> PAGE_SHIFT, + mr[i].end >> PAGE_SHIFT, + mr[i].page_size_mask == (1< table_start) + reserve_early(table_start << PAGE_SHIFT, + table_end << PAGE_SHIFT, "PGTABLE"); + + if (!after_bootmem) + early_memtest(start, end); + + return ret >> PAGE_SHIFT; +} + /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cd3c24b490a..187522a0c66 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -59,13 +59,9 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; - -int after_bootmem; - -int direct_gbpages; +extern unsigned long __initdata table_start; +extern unsigned long __meminitdata table_end; +extern unsigned long __meminitdata table_top; static __init void *alloc_low_page(void) { @@ -227,9 +223,9 @@ static inline int is_kernel_text(unsigned long addr) * of max_low_pfn pages, by creating page tables starting from address * PAGE_OFFSET: */ -static void __init kernel_physical_mapping_init(unsigned long start_pfn, - unsigned long end_pfn, - int use_pse) +void __init kernel_physical_mapping_init(unsigned long start_pfn, + unsigned long end_pfn, + int use_pse) { pgd_t *pgd_base = swapper_pg_dir; int pgd_idx, pmd_idx, pte_ofs; @@ -509,7 +505,7 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init early_ioremap_page_table_range_init(void) +void __init early_ioremap_page_table_range_init(void) { pgd_t *pgd_base = swapper_pg_dir; unsigned long vaddr, end; @@ -834,296 +830,6 @@ void __init setup_bootmem_allocator(void) after_bootmem = 1; } -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ -#ifdef CONFIG_X86_32 - start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); -} - -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -#ifdef CONFIG_X86_32 -#define NR_RANGE_MR 3 -#else /* CONFIG_X86_64 */ -#define NR_RANGE_MR 5 -#endif - -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; -#ifdef CONFIG_X86_32 - /* - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. - */ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); - else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); -#endif - - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) - continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof(struct map_range)); - mr[i--].start = old_start; - nr_range--; - } - - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<> PAGE_SHIFT, - mr[i].end >> PAGE_SHIFT, - mr[i].page_size_mask == (1< table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - if (!after_bootmem) - early_memtest(start, end); - - return ret >> PAGE_SHIFT; -} - - /* * paging_init() sets up the page tables - note that the first 8MB are * already mapped by head.S. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d101990e463..a32fe075608 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -61,12 +61,6 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -int direct_gbpages -#ifdef CONFIG_DIRECT_GBPAGES - = 1 -#endif -; - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; @@ -87,8 +81,6 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -int after_bootmem; - pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); @@ -291,9 +283,9 @@ void __init cleanup_highmap(void) } } -static unsigned long __initdata table_start; -static unsigned long __meminitdata table_end; -static unsigned long __meminitdata table_top; +extern unsigned long __initdata table_start; +extern unsigned long __meminitdata table_end; +extern unsigned long __meminitdata table_top; static __ref void *alloc_low_page(unsigned long *phys) { @@ -547,77 +539,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); - - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); -#ifdef CONFIG_X86_32 - extra += PMD_SIZE; -#endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - - tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); - -#ifdef CONFIG_X86_32 - /* for fixmap */ - tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); -#endif - - /* - * RED-PEN putting page tables only on node 0 could - * cause a hotspot and fill up ZONE_DMA. The page tables - * need roughly 0.5KB per GB. - */ -#ifdef CONFIG_X86_32 - start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); - - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); -} - -static void __init init_gbpages(void) -{ - if (direct_gbpages && cpu_has_gbpages) - printk(KERN_INFO "Using GB pages for direct mapping\n"); - else - direct_gbpages = 0; -} - -static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask) +unsigned long __meminit +kernel_physical_mapping_init(unsigned long start, + unsigned long end, + unsigned long page_size_mask) { unsigned long next, last_map_addr = end; @@ -654,231 +579,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start, return last_map_addr; } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -#ifdef CONFIG_X86_32 -#define NR_RANGE_MR 3 -#else /* CONFIG_X86_64 */ -#define NR_RANGE_MR 5 -#endif - -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) -{ - if (start_pfn < end_pfn) { - if (nr_range >= NR_RANGE_MR) - panic("run out of range for init_memory_mapping\n"); - mr[nr_range].start = start_pfn<> PAGE_SHIFT; - pos = start_pfn << PAGE_SHIFT; -#ifdef CONFIG_X86_32 - /* - * Don't use a large page for the first 2/4MB of memory - * because there are often fixed size MTRRs in there - * and overlapping MTRRs into large pages can cause - * slowdowns. - */ - if (pos == 0) - end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT); - else - end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#endif - if (end_pfn > (end >> PAGE_SHIFT)) - end_pfn = end >> PAGE_SHIFT; - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - pos = end_pfn << PAGE_SHIFT; - } - - /* big page (2M) range */ - start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); -#ifdef CONFIG_X86_32 - end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); -#else /* CONFIG_X86_64 */ - end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT))) - end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)); -#endif - - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PUD_SHIFT) - << (PUD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & - ((1<>PMD_SHIFT) - << (PMD_SHIFT - PAGE_SHIFT); - end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); - if (start_pfn < end_pfn) { - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, - page_size_mask & (1<>PAGE_SHIFT; - end_pfn = end>>PAGE_SHIFT; - nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); - - /* try to merge same page size and continuous */ - for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { - unsigned long old_start; - if (mr[i].end != mr[i+1].start || - mr[i].page_size_mask != mr[i+1].page_size_mask) - continue; - /* move it */ - old_start = mr[i].start; - memmove(&mr[i], &mr[i+1], - (nr_range - 1 - i) * sizeof(struct map_range)); - mr[i--].start = old_start; - nr_range--; - } - - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, - (mr[i].page_size_mask & (1<> PAGE_SHIFT, - mr[i].end >> PAGE_SHIFT, - mr[i].page_size_mask == (1< table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); - - if (!after_bootmem) - early_memtest(start, end); - - return ret >> PAGE_SHIFT; -} - #ifndef CONFIG_NUMA void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) { -- cgit v1.2.3 From 298af9d89f3f5292e81a0a00f729c415adc4d8fb Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:06 +0200 Subject: x86: fix up some bad global variable names in mm/init.c Impact: cleanup The table_start, table_end, and table_top are too generic for global namespace so rename them to be more specific. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-15-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 26 +++++++++++++------------- arch/x86/mm/init_32.c | 14 +++++++------- arch/x86/mm/init_64.c | 10 +++++----- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3a21b136da2..5bbdfe7459d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -23,9 +23,9 @@ kernel_physical_mapping_init(unsigned long start, unsigned long page_size_mask); #endif -unsigned long __initdata table_start; -unsigned long __meminitdata table_end; -unsigned long __meminitdata table_top; +unsigned long __initdata e820_table_start; +unsigned long __meminitdata e820_table_end; +unsigned long __meminitdata e820_table_top; int after_bootmem; @@ -78,21 +78,21 @@ static void __init find_early_table_space(unsigned long end, int use_pse, */ #ifdef CONFIG_X86_32 start = 0x7000; - table_start = find_e820_area(start, max_pfn_mapped<>= PAGE_SHIFT; - table_end = table_start; - table_top = table_start + (tables >> PAGE_SHIFT); + e820_table_start >>= PAGE_SHIFT; + e820_table_end = e820_table_start; + e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); + end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); } struct map_range { @@ -324,9 +324,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, #endif __flush_tlb_all(); - if (!after_bootmem && table_end > table_start) - reserve_early(table_start << PAGE_SHIFT, - table_end << PAGE_SHIFT, "PGTABLE"); + if (!after_bootmem && e820_table_end > e820_table_start) + reserve_early(e820_table_start << PAGE_SHIFT, + e820_table_end << PAGE_SHIFT, "PGTABLE"); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 187522a0c66..e9df0d9cdeb 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -59,16 +59,16 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); -extern unsigned long __initdata table_start; -extern unsigned long __meminitdata table_end; -extern unsigned long __meminitdata table_top; +extern unsigned long __initdata e820_table_start; +extern unsigned long __meminitdata e820_table_end; +extern unsigned long __meminitdata e820_table_top; static __init void *alloc_low_page(void) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); @@ -149,8 +149,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, if (pmd_idx_kmap_begin != pmd_idx_kmap_end && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end - && ((__pa(pte) >> PAGE_SHIFT) < table_start - || (__pa(pte) >> PAGE_SHIFT) >= table_end)) { + && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start + || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { pte_t *newpte; int i; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a32fe075608..a1d33c58b49 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -283,13 +283,13 @@ void __init cleanup_highmap(void) } } -extern unsigned long __initdata table_start; -extern unsigned long __meminitdata table_end; -extern unsigned long __meminitdata table_top; +extern unsigned long __initdata e820_table_start; +extern unsigned long __meminitdata e820_table_end; +extern unsigned long __meminitdata e820_table_top; static __ref void *alloc_low_page(unsigned long *phys) { - unsigned long pfn = table_end++; + unsigned long pfn = e820_table_end++; void *adr; if (after_bootmem) { @@ -299,7 +299,7 @@ static __ref void *alloc_low_page(unsigned long *phys) return adr; } - if (pfn >= table_top) + if (pfn >= e820_table_top) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); -- cgit v1.2.3 From e53fb04fce6d246ebed755b904ed1b0b814a754c Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:07 +0200 Subject: x86: unify kernel_physical_mapping_init() function signatures Impact: cleanup In preparation for moving the function declaration to a header file, unify 32-bit and 64-bit signatures. Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-16-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 13 +++---------- arch/x86/mm/init_32.c | 13 ++++++++++--- arch/x86/mm/init_64.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 5bbdfe7459d..6475693a81a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -11,17 +11,12 @@ #ifdef CONFIG_X86_32 extern void __init early_ioremap_page_table_range_init(void); -extern void __init kernel_physical_mapping_init(unsigned long start_pfn, - unsigned long end_pfn, - int use_pse); #endif -#ifdef CONFIG_X86_64 -extern unsigned long __meminit +extern unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask); -#endif unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; @@ -301,10 +296,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, #ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) - kernel_physical_mapping_init( - mr[i].start >> PAGE_SHIFT, - mr[i].end >> PAGE_SHIFT, - mr[i].page_size_mask == (1<> PAGE_SHIFT; + end_pfn = end >> PAGE_SHIFT; + /* * First iteration will setup identity mapping using large/small pages * based on use_pse, with other attributes same as set by @@ -350,6 +356,7 @@ repeat: mapping_iter = 2; goto repeat; } + return 0; } pte_t *kmap_pte; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a1d33c58b49..f441ae31631 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -539,7 +539,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -unsigned long __meminit +unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) -- cgit v1.2.3 From 4fcb208391be5cf82c6fe2779c5eb9245ac97e91 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 14:55:08 +0200 Subject: x86: move function and variable declarations to asm/init.h Impact: cleanup Signed-off-by: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-17-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 10 +--------- arch/x86/mm/init_32.c | 6 +----- arch/x86/mm/init_64.c | 5 +---- 3 files changed, 3 insertions(+), 18 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6475693a81a..6d63e3d1253 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,21 +3,13 @@ #include #include +#include #include #include #include #include #include -#ifdef CONFIG_X86_32 -extern void __init early_ioremap_page_table_range_init(void); -#endif - -extern unsigned long __init -kernel_physical_mapping_init(unsigned long start, - unsigned long end, - unsigned long page_size_mask); - unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; unsigned long __meminitdata e820_table_top; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5ca9c6c3439..1669693e97d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,6 +49,7 @@ #include #include #include +#include unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; @@ -58,11 +59,6 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); - -extern unsigned long __initdata e820_table_start; -extern unsigned long __meminitdata e820_table_end; -extern unsigned long __meminitdata e820_table_top; - static __init void *alloc_low_page(void) { unsigned long pfn = e820_table_end++; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f441ae31631..7dd7ce49d69 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -48,6 +48,7 @@ #include #include #include +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -283,10 +284,6 @@ void __init cleanup_highmap(void) } } -extern unsigned long __initdata e820_table_start; -extern unsigned long __meminitdata e820_table_end; -extern unsigned long __meminitdata e820_table_top; - static __ref void *alloc_low_page(unsigned long *phys) { unsigned long pfn = e820_table_end++; -- cgit v1.2.3 From 62436fe9ee10f5e0dd087b106d69d93c9549935a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 14:39:03 +0100 Subject: x86: move init_memory_mapping() to common mm/init.c, build fix on 32-bit PAE Impact: build fix Cc: Pekka Enberg Cc: Yinghai Lu LKML-Reference: <1236257708-27269-14-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 1669693e97d..5e5126e0d54 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -605,7 +605,7 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -static void __init set_nx(void) +void __init set_nx(void) { unsigned int v[4], l, h; -- cgit v1.2.3 From dc16ecf7fd1fad7436832121435d4926a81d469e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:10:44 -0800 Subject: x86-32: use specific __vmalloc_start_set flag in __virt_addr_valid Rather than relying on the ever-unreliable system_state, add a specific __vmalloc_start_set flag to indicate whether the vmalloc area has meaningful boundaries yet, and use that in x86-32's __phys_addr and __virt_addr_valid. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/ioremap.c | 7 +++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5e5126e0d54..d57dfffb021 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -59,6 +59,8 @@ unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); +bool __read_mostly __vmalloc_start_set = false; + static __init void *alloc_low_page(void) { unsigned long pfn = e820_table_end++; @@ -757,6 +759,8 @@ void __init initmem_init(unsigned long start_pfn, #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif + __vmalloc_start_set = true; + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 433f7bd4648..a23ca5b5bf2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -76,10 +76,9 @@ static inline int phys_addr_valid(unsigned long addr) #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - /* VMALLOC_* aren't constants; not available at the boot time */ + /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && - is_vmalloc_addr((void *) x)); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); @@ -89,7 +88,7 @@ bool __virt_addr_valid(unsigned long x) { if (x < PAGE_OFFSET) return false; - if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } -- cgit v1.2.3 From ed26dbe5ae045e5bf95c6dc27497397a3fde52e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:16:51 -0800 Subject: x86: pre-initialize boot_cpu_data.x86_phys_bits to avoid system_state tests Impact: cleanup, micro-optimization Pre-initialize boot_cpu_data.x86_phys_bits to a reasonable default to remove the use of system_state tests in __virt_addr_valid() and __phys_addr(). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a23ca5b5bf2..62773abdf08 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x) } else { VIRTUAL_BUG_ON(x < PAGE_OFFSET); x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : - !phys_addr_valid(x)); + VIRTUAL_BUG_ON(!phys_addr_valid(x)); } return x; } @@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x) if (x < PAGE_OFFSET) return false; x -= PAGE_OFFSET; - if (system_state == SYSTEM_BOOTING ? - x > MAXMEM : !phys_addr_valid(x)) { + if (!phys_addr_valid(x)) return false; - } } return pfn_valid(x >> PAGE_SHIFT); -- cgit v1.2.3 From d1a8e7792047f7dca7eb5759250e2c12800bf262 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 6 Mar 2009 03:12:50 -0800 Subject: x86: make "memtest" like "memtest=17" Impact: make boot command line "memtest" do one loop by default So don't need to guess many patterns in one loop. Signed-off-by: Yinghai Lu LKML-Reference: <49B10532.3020105@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 0bcd7883d03..605c8be0621 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg) { if (arg) memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + return 0; } -- cgit v1.2.3 From c77a3b59c624454c501cbfa1a3611d5a00bf9532 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 17:04:26 +0200 Subject: x86: fix uninitialized variable in init_memory_mapping() Signed-off-by: Pekka Enberg LKML-Reference: <1236265466.31324.9.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6d63e3d1253..15219e0d124 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -134,8 +134,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, { unsigned long page_size_mask = 0; unsigned long start_pfn, end_pfn; + unsigned long ret = 0; unsigned long pos; - unsigned long ret; struct map_range mr[NR_RANGE_MR]; int nr_range, i; -- cgit v1.2.3 From 5dd61dfabcaa5bfb67afb8a2d255bd1e156562e3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 5 Mar 2009 17:04:57 +0200 Subject: x86: rename do_not_nx to disable_nx in mm/init_64.c As a preparational step for unifying noexec handling on 32-bit and 64-bit, rename the do_not_nx variable to disable_nx on 64-bit. Signed-off-by: Pekka Enberg LKML-Reference: <1236265497.31324.11.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8a853bc3b28..54efa57d1c0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -85,7 +85,7 @@ early_param("gbpages", parse_direct_gbpages_on); pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int do_not_nx __cpuinitdata; +static int disable_nx __cpuinitdata; /* * noexec=on|off @@ -100,9 +100,9 @@ static int __init nonx_setup(char *str) return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; + disable_nx = 0; } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; + disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } return 0; @@ -114,7 +114,7 @@ void __cpuinit check_efer(void) unsigned long efer; rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) + if (!(efer & EFER_NX) || disable_nx) __supported_pte_mask &= ~_PAGE_NX; } -- cgit v1.2.3 From 8827247ffcc9e880cbe4705655065cf011265157 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Sat, 7 Mar 2009 13:34:19 +0800 Subject: x86: don't define __this_fixmap_does_not_exist() Impact: improve out-of-range fixmap index debugging Commit "1b42f51630c7eebce6fb780b480731eb81afd325" defined the __this_fixmap_does_not_exist() function with a WARN_ON(1) in it. This causes the linker to not report an error when __this_fixmap_does_not_exist() is called with a non-constant parameter. Ingo defined __this_fixmap_does_not_exist() because he wanted to get virt addresses of fix memory of nest level by non-constant index. But we can fix this and still keep the link-time check: We can get the four slot virt addresses on link time and store them to array slot_virt[]. Then we can then refer the slot_virt with non-constant index, in the ioremap-leak detection code. Signed-off-by: Wang Chen LKML-Reference: <49B2075B.4070509@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 62773abdf08..96786ef2c9a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -504,13 +504,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) return &bm_pte[pte_index(addr)]; } +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + void __init early_ioremap_init(void) { pmd_t *pmd; + int i; if (early_ioremap_debug) printk(KERN_INFO "early_ioremap_init()\n"); + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); pmd_populate_kernel(&init_mm, pmd, bm_pte); @@ -577,6 +583,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; + static int __init check_early_ioremap_leak(void) { int count = 0; @@ -598,7 +605,8 @@ static int __init check_early_ioremap_leak(void) } late_initcall(check_early_ioremap_leak); -static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) +static void __init __iomem * +__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; unsigned int nrpages; @@ -664,9 +672,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo --nrpages; } if (early_ioremap_debug) - printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]); - prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0)); + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; } @@ -734,8 +742,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) } prev_map[slot] = NULL; } - -void __this_fixmap_does_not_exist(void) -{ - WARN_ON(1); -} -- cgit v1.2.3 From e954ef20c29b7af07a8cb5452f14fb69e3d9d2b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 5 Mar 2009 12:04:57 -0800 Subject: x86: fix warning about nodeid Impact: cleanup Ingo found there warning about nodeid with some configs. try to use for_each_online_node for non numa too. in that case nodeid will be 0. also move out boundary checking from setup_node_bootmem(), so non-numa config will not check it. Signed-off-by: Yinghai Lu Cc: Andrew Morton LKML-Reference: <49B03069.80001@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2966c6b8d30..db81e9a8556 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -806,11 +806,6 @@ static unsigned long __init setup_node_bootmem(int nodeid, { unsigned long bootmap_size; - if (start_pfn > max_low_pfn) - return bootmap; - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; - /* don't touch min_low_pfn */ bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap >> PAGE_SHIFT, @@ -843,13 +838,23 @@ void __init setup_bootmem_allocator(void) max_pfn_mapped< max_low_pfn) + continue; + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; #else - bootmap = setup_node_bootmem(0, 0, max_low_pfn, bootmap); + start_pfn = 0; + end_pfn = max_low_pfn; #endif + bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, + bootmap); + } after_bootmem = 1; } -- cgit v1.2.3 From 0feca851c1b3cb4ebfa3149144b3d5de0879ebaa Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 6 Mar 2009 10:09:26 -0800 Subject: x86-32: make sure virt_addr_valid() returns false for fixmap addresses I found that virt_addr_valid() was returning true for fixmap addresses. I'm not sure whether pfn_valid() is supposed to include this test, but there's no harm in being explicit. Signed-off-by: Jeremy Fitzhardinge Cc: Jiri Slaby Cc: Yinghai Lu LKML-Reference: <49B166D6.2080505@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 62773abdf08..62def579573 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -87,6 +87,8 @@ bool __virt_addr_valid(unsigned long x) return false; if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) return false; + if (x >= FIXADDR_START) + return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); -- cgit v1.2.3 From bb6d59ca927d855ffac567b35c0a790c67016103 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 11 Mar 2009 23:33:18 +0900 Subject: x86: unify kmap_atomic_pfn() and iomap_atomic_prot_pfn() kmap_atomic_pfn() and iomap_atomic_prot_pfn() are almost same except pgprot. This patch removes the code duplication for these two functions. Signed-off-by: Akinobu Mita LKML-Reference: <20090311143317.GA22244@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 17 +++++++++++------ arch/x86/mm/iomap_32.c | 13 ++----------- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index d11745334a6..ae4c8dae266 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -121,23 +121,28 @@ void kunmap_atomic(void *kvaddr, enum km_type type) pagefault_enable(); } -/* This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; pagefault_disable(); - idx = type + KM_TYPE_NR*smp_processor_id(); + idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); + set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); arch_flush_lazy_mmu_mode(); return (void*) vaddr; } + +/* This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +{ + return kmap_atomic_prot_pfn(pfn, type, kmap_prot); +} EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ struct page *kmap_atomic_to_page(void *ptr) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 04102d42ff4..592984e5496 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -18,6 +18,7 @@ #include #include +#include #include int is_io_mapping_possible(resource_size_t base, unsigned long size) @@ -36,11 +37,6 @@ EXPORT_SYMBOL_GPL(is_io_mapping_possible); void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { - enum fixed_addresses idx; - unsigned long vaddr; - - pagefault_disable(); - /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the @@ -50,12 +46,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void*) vaddr; + return kmap_atomic_prot_pfn(pfn, type, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); -- cgit v1.2.3 From 12074fa1073013dd11f1cff41db018d5cff4ecd9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 11 Mar 2009 23:34:50 +0900 Subject: x86: debug check for kmap_atomic_pfn and iomap_atomic_prot_pfn() It may be useful for kmap_atomic_pfn() and iomap_atomic_prot_pfn() to check invalid kmap usage as well as kmap_atomic. Signed-off-by: Akinobu Mita LKML-Reference: <20090311143449.GB22244@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index ae4c8dae266..f256e73542d 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -128,6 +128,8 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) pagefault_disable(); + debug_kmap_atomic_prot(type); + idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); -- cgit v1.2.3 From afcfe024aebd74b0984a41af9a34e009cf5badaf Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Wed, 11 Mar 2009 20:29:45 +0000 Subject: x86: mmiotrace: quieten spurious warning message This message was being incorrectly emitted when using gdb, so compile it out by default for now; there will be a better fix in v2.6.30. Reported-by: Ingo Molnar Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 6a518dd08a3..4f115e00486 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -310,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_warning("kmmio: spurious debug trap on CPU %d.\n", + pr_debug("kmmio: spurious debug trap on CPU %d.\n", smp_processor_id()); goto out; } -- cgit v1.2.3 From dd63fdcc63f0f853b116b52e56200a0e0227cf5f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 03:20:49 +0100 Subject: x86: unify kmap_atomic_pfn() and iomap_atomic_prot_pfn(), fix Impact: build fix Move kmap_atomic_prot_pfn() to iomap_32.c. It is used on all 32-bit kernels, while highmem_32.c is only built on highmem kernels. ( Note: the debug_kmap_atomic_prot() check is removed for now, that problem is handled via another patch. ) Reported-by: Thomas Gleixner Cc: Akinobu Mita LKML-Reference: <20090311143317.GA22244@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 20 ++------------------ arch/x86/mm/iomap_32.c | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index f256e73542d..522db5e3d0b 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -121,24 +121,8 @@ void kunmap_atomic(void *kvaddr, enum km_type type) pagefault_enable(); } -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) -{ - enum fixed_addresses idx; - unsigned long vaddr; - - pagefault_disable(); - - debug_kmap_atomic_prot(type); - - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void*) vaddr; -} - -/* This is the same as kmap_atomic() but can map memory that doesn't +/* + * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 592984e5496..6e60ba698ce 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -32,7 +32,23 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(is_io_mapping_possible); -/* Map 'pfn' using fixed map 'type' and protections 'prot' +void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + pagefault_disable(); + + idx = type + KM_TYPE_NR * smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); + arch_flush_lazy_mmu_mode(); + + return (void *)vaddr; +} + +/* + * Map 'pfn' using fixed map 'type' and protections 'prot' */ void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) -- cgit v1.2.3 From 13c6c53282d99c82e79b02477efd2c1e30a991ef Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:37:34 +0000 Subject: x86, 32-bit: also use cpuinfo_x86's x86_{phys,virt}_bits members Impact: 32/64-bit consolidation In a first step, this allows fixing phys_addr_valid() for PAE (which until now reported all addresses to be valid). Subsequently, this will also allow simplifying some MTRR handling code. Signed-off-by: Jan Beulich LKML-Reference: <49B9101E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index aca924a30ee..83ed74affba 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -22,13 +22,17 @@ #include #include -#ifdef CONFIG_X86_64 - -static inline int phys_addr_valid(unsigned long addr) +static inline int phys_addr_valid(resource_size_t addr) { - return addr < (1UL << boot_cpu_data.x86_phys_bits); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif } +#ifdef CONFIG_X86_64 + unsigned long __phys_addr(unsigned long x) { if (x >= __START_KERNEL_map) { @@ -65,11 +69,6 @@ EXPORT_SYMBOL(__virt_addr_valid); #else -static inline int phys_addr_valid(unsigned long addr) -{ - return 1; -} - #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { -- cgit v1.2.3 From dc9dd5cc854cde110d2421f3a11fec7597e059c1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 12:40:06 +0000 Subject: x86: move save_mr() into .meminit.text Impact: cleanup, save memory The function is only being called from boot or memory hotplug paths. Signed-off-by: Jan Beulich LKML-Reference: <49B910B6.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 15219e0d124..fd3da1dda1c 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -94,9 +94,9 @@ struct map_range { #define NR_RANGE_MR 5 #endif -static int save_mr(struct map_range *mr, int nr_range, - unsigned long start_pfn, unsigned long end_pfn, - unsigned long page_size_mask) +static int __meminit save_mr(struct map_range *mr, int nr_range, + unsigned long start_pfn, unsigned long end_pfn, + unsigned long page_size_mask) { if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) -- cgit v1.2.3 From 698609bdcd35d0641f4c6622c83680ab1a6d67cb Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 12 Mar 2009 13:11:50 +0000 Subject: x86: create a non-zero sized bm_pte only when needed Impact: kernel image size reduction Since in most configurations the pmd page needed maps the same range of virtual addresses which is also mapped by the earlier inserted one for covering FIX_DBGP_BASE, that page (and its insertion in the page tables) can be avoided altogether by detecting the condition at compile time. Signed-off-by: Jan Beulich LKML-Reference: <49B91826.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 83ed74affba..55e127f71ed 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -487,7 +487,12 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; +#define __FIXADDR_TOP (-PAGE_SIZE) +static pte_t bm_pte[(__fix_to_virt(FIX_DBGP_BASE) + ^ __fix_to_virt(FIX_BTMAP_BEGIN)) >> PMD_SHIFT + ? PAGE_SIZE / sizeof(pte_t) : 0] __page_aligned_bss; +#undef __FIXADDR_TOP +static __initdata pte_t *bm_ptep; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { @@ -502,6 +507,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) static inline pte_t * __init early_ioremap_pte(unsigned long addr) { + if (!sizeof(bm_pte)) + return &bm_ptep[pte_index(addr)]; return &bm_pte[pte_index(addr)]; } @@ -519,8 +526,14 @@ void __init early_ioremap_init(void) slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); - memset(bm_pte, 0, sizeof(bm_pte)); - pmd_populate_kernel(&init_mm, pmd, bm_pte); + if (sizeof(bm_pte)) { + memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); + } else { + bm_ptep = pte_offset_kernel(pmd, 0); + if (early_ioremap_debug) + printk(KERN_INFO "bm_ptep=%p\n", bm_ptep); + } /* * The boot-ioremap range spans multiple pmds, for which -- cgit v1.2.3 From 4bb9c5c02153dfc89a6c73a6f32091413805ad7d Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 12 Mar 2009 17:45:27 -0700 Subject: VM, x86, PAT: Change is_linear_pfn_mapping to not use vm_pgoff Impact: fix false positive PAT warnings - also fix VirtalBox hang Use of vma->vm_pgoff to identify the pfnmaps that are fully mapped at mmap time is broken. vm_pgoff is set by generic mmap code even for cases where drivers are setting up the mappings at the fault time. The problem was originally reported here: http://marc.info/?l=linux-kernel&m=123383810628583&w=2 Change is_linear_pfn_mapping logic to overload VM_INSERTPAGE flag along with VM_PFNMAP to mean full PFNMAP setup at mmap time. Problem also tracked at: http://bugzilla.kernel.org/show_bug.cgi?id=12800 Reported-by: Thomas Hellstrom Tested-by: Frans Pop Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha @intel.com> Cc: Nick Piggin Cc: "ebiederm@xmission.com" Cc: # only for 2.6.29.1, not .28 LKML-Reference: <20090313004527.GA7176@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e0ab173b697..21bc1f787ae 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -641,10 +641,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. + * reserve_pfn_range() doesn't support RAM pages. Maintain the current + * behavior with RAM pages by returning success. */ if (is_ram != 0) - return -EINVAL; + return 0; ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) -- cgit v1.2.3 From 93dbda7cbcd70a0bd1a99f39f44a9ccde8ab9040 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Feb 2009 17:35:44 -0800 Subject: x86: add brk allocation for very, very early allocations Impact: new interface Add a brk()-like allocator which effectively extends the bss in order to allow very early code to do dynamic allocations. This is better than using statically allocated arrays for data in subsystems which may never get used. The space for brk allocations is in the bss ELF segment, so that the space is mapped properly by the code which maps the kernel, and so that bootloaders keep the space free rather than putting a ramdisk or something into it. The bss itself, delimited by __bss_stop, ends before the brk area (__brk_base to __brk_limit). The kernel text, data and bss is reserved up to __bss_stop. Any brk-allocated data is reserved separately just before the kernel pagetable is built, as that code allocates from unreserved spaces in the e820 map, potentially allocating from any unused brk memory. Ultimately any unused memory in the brk area is used in the general kernel memory pool. Initially the brk space is set to 1MB, which is probably much larger than any user needs (the largest current user is i386 head_32.S's code to build the pagetables to map the kernel, which can get fairly large with a big kernel image and no PSE support). So long as the system has sufficient memory for the bootloader to reserve the kernel+1MB brk, there are no bad effects resulting from an over-large brk. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/mm/pageattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 9c4294986af..1280565670e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void) static inline unsigned long highmap_end_pfn(void) { - return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -711,7 +712,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the high * mapping already: */ - if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, _brk_end)) return 0; /* -- cgit v1.2.3 From 0920dce7d5889634faa336f65833ee44f3b7651e Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 16 Mar 2009 00:15:18 +0900 Subject: x86, mm: remove unnecessary include file from iomap_32.c asm/highmem.h inclusion is added to use kmap_atomic_prot_pfn() by commit bb6d59ca927d855ffac567b35c0a790c67016103 Now kmap_atomic_prot_pfn is moved to iomap_32.c by commit dd63fdcc63f0f853b116b52e56200a0e0227cf5f So the asm/highmem.h inclusion in iomap_32.c is unnecessary now. Signed-off-by: Akinobu Mita LKML-Reference: <20090315151517.GA29074@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 6e60ba698ce..699c9b2895a 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -18,7 +18,6 @@ #include #include -#include #include int is_io_mapping_possible(resource_size_t base, unsigned long size) -- cgit v1.2.3 From ce4e240c279a31096f74afa6584a62d64a1ba8c8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 17 Mar 2009 10:16:54 -0800 Subject: x86: add x2apic_wrmsr_fence() to x2apic flush tlb paths Impact: optimize APIC IPI related barriers Uncached MMIO accesses for xapic are inherently serializing and hence we don't need explicit barriers for xapic IPI paths. x2apic MSR writes/reads don't have serializing semantics and hence need a serializing instruction or mfence, to make all the previous memory stores globally visisble before the x2apic msr write for IPI. Add x2apic_wrmsr_fence() in flush tlb path to x2apic specific paths. Signed-off-by: Suresh Siddha Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Jens Axboe Cc: Linus Torvalds Cc: "Paul E. McKenney" Cc: Rusty Russell Cc: Steven Rostedt Cc: "steiner@sgi.com" Cc: Nick Piggin LKML-Reference: <1237313814.27006.203.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a654d59e448..821e97017e9 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -186,11 +186,6 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id())); - /* - * Make the above memory operations globally visible before - * sending the IPI. - */ - smp_mb(); /* * We have to send the IPI only to * CPUs affected. -- cgit v1.2.3 From b40c757964bbad76ecfa88eda9eb0b4d76dd8b40 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Mar 2009 13:03:32 -0700 Subject: x86/32: no need to use set_pte_present in set_pte_vaddr Impact: cleanup, remove last user of set_pte_present set_pte_vaddr() is only used to install ptes in fixmaps, and should never be used to overwrite a present mapping. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel LKML-Reference: <1237406613-2929-1-git-send-email-jeremy@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index f2e477c91c1..46c8834aedc 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -50,7 +50,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) } pte = pte_offset_kernel(pmd, vaddr); if (pte_val(pteval)) - set_pte_present(&init_mm, vaddr, pte, pteval); + set_pte_at(&init_mm, vaddr, pte, pteval); else pte_clear(&init_mm, vaddr, pte); -- cgit v1.2.3 From 728c9518873de0bbb92b66daa1943b12e5b9f80f Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 19 Mar 2009 14:51:13 -0700 Subject: x86, CPA: Add a flag parameter to cpa set_clr() Change change_page_attr_set_clr() array parameter to a flag. This helps following patches which adds an interface to change attr to uc/wb over a set of pages referred by struct page. Signed-off-by: Venkatesh Pallipadi Cc: arjan@infradead.org Cc: eric@anholt.net Cc: Venkatesh Pallipadi Cc: airlied@redhat.com LKML-Reference: <20090319215358.611346000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1280565670e..69009afa98c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -787,7 +787,7 @@ static inline int cache_attr(pgprot_t attr) static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split, int array) + int force_split, int in_flag) { struct cpa_data cpa; int ret, cache, checkalias; @@ -802,7 +802,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, return 0; /* Ensure we are PAGE_SIZE aligned */ - if (!array) { + if (!(in_flag & CPA_ARRAY)) { if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; /* @@ -840,7 +840,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa.curpage = 0; cpa.force_split = force_split; - if (array) + if (in_flag & CPA_ARRAY) cpa.flags |= CPA_ARRAY; /* No alias checking for _NX bit modifications */ @@ -889,14 +889,14 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, - array); + (array ? CPA_ARRAY : 0)); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, - array); + (array ? CPA_ARRAY : 0)); } int _set_memory_uc(unsigned long addr, int numpages) -- cgit v1.2.3 From 9ae2847591c857bed44bc094b908b412bfa1b244 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 19 Mar 2009 14:51:14 -0700 Subject: x86, PAT: Add support for struct page pointer array in cpa set_clr Add struct page array pointer to cpa struct and CPA_PAGES_ARRAY. With that we can add change_page_attr_set_clr() a parameter to pass struct page array pointer and that can be handled by the underlying cpa code. cpa_flush_array() is also changed to support both addr array or struct page pointer array, depending on the flag. Signed-off-by: Venkatesh Pallipadi Cc: arjan@infradead.org Cc: eric@anholt.net Cc: airlied@redhat.com LKML-Reference: <20090319215358.758513000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 79 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 29 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 69009afa98c..e5c257fb41e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -34,6 +34,7 @@ struct cpa_data { unsigned long pfn; unsigned force_split : 1; int curpage; + struct page **pages; }; /* @@ -46,6 +47,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 +#define CPA_PAGES_ARRAY 4 #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -202,10 +204,10 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache) +static void cpa_flush_array(unsigned long *start, int numpages, int cache, + int in_flags, struct page **pages) { unsigned int i, level; - unsigned long *addr; BUG_ON(irqs_disabled()); @@ -226,14 +228,22 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache) * will cause all other CPUs to flush the same * cachelines: */ - for (i = 0, addr = start; i < numpages; i++, addr++) { - pte_t *pte = lookup_address(*addr, &level); + for (i = 0; i < numpages; i++) { + unsigned long addr; + pte_t *pte; + + if (in_flags & CPA_PAGES_ARRAY) + addr = (unsigned long)page_address(pages[i]); + else + addr = start[i]; + + pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *) *addr, PAGE_SIZE); + clflush_cache_range((void *)addr, PAGE_SIZE); } } @@ -585,7 +595,9 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) + address = (unsigned long)page_address(cpa->pages[cpa->curpage]); + else if (cpa->flags & CPA_ARRAY) address = cpa->vaddr[cpa->curpage]; else address = *cpa->vaddr; @@ -688,7 +700,9 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & CPA_PAGES_ARRAY) + vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]); + else if (cpa->flags & CPA_ARRAY) vaddr = cpa->vaddr[cpa->curpage]; else vaddr = *cpa->vaddr; @@ -699,7 +713,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~CPA_ARRAY; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); ret = __change_page_attr_set_clr(&alias_cpa, 0); @@ -725,7 +739,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa.vaddr = &temp_cpa_vaddr; - alias_cpa.flags &= ~CPA_ARRAY; + alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); /* * The high mapping range is imprecise, so ignore the return value. @@ -746,7 +760,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ cpa->numpages = numpages; /* for array changes, we can't use large page */ - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc) @@ -770,7 +784,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - if (cpa->flags & CPA_ARRAY) + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) cpa->curpage++; else *cpa->vaddr += cpa->numpages * PAGE_SIZE; @@ -787,7 +801,8 @@ static inline int cache_attr(pgprot_t attr) static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split, int in_flag) + int force_split, int in_flag, + struct page **pages) { struct cpa_data cpa; int ret, cache, checkalias; @@ -802,15 +817,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, return 0; /* Ensure we are PAGE_SIZE aligned */ - if (!(in_flag & CPA_ARRAY)) { - if (*addr & ~PAGE_MASK) { - *addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); - } - } else { + if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { @@ -818,6 +825,18 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, WARN_ON_ONCE(1); } } + } else if (!(in_flag & CPA_PAGES_ARRAY)) { + /* + * in_flag of CPA_PAGES_ARRAY implies it is aligned. + * No need to cehck in that case + */ + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } } /* Must avoid aliasing mappings in the highmem code */ @@ -833,6 +852,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, arch_flush_lazy_mmu_mode(); cpa.vaddr = addr; + cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; @@ -840,8 +860,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa.curpage = 0; cpa.force_split = force_split; - if (in_flag & CPA_ARRAY) - cpa.flags |= CPA_ARRAY; + if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) + cpa.flags |= in_flag; /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -867,9 +887,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, * wbindv): */ if (!ret && cpu_has_clflush) { - if (cpa.flags & CPA_ARRAY) - cpa_flush_array(addr, numpages, cache); - else + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(addr, numpages, cache, + cpa.flags, pages); + } else cpa_flush_range(*addr, numpages, cache); } else cpa_flush_all(cache); @@ -889,14 +910,14 @@ static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, - (array ? CPA_ARRAY : 0)); + (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, - (array ? CPA_ARRAY : 0)); + (array ? CPA_ARRAY : 0), NULL); } int _set_memory_uc(unsigned long addr, int numpages) @@ -1044,7 +1065,7 @@ int set_memory_np(unsigned long addr, int numpages) int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), - __pgprot(0), 1, 0); + __pgprot(0), 1, 0, NULL); } int set_pages_uc(struct page *page, int numpages) -- cgit v1.2.3 From 0f3507555f6fa4acbc85a646d6e8766230db38fc Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Thu, 19 Mar 2009 14:51:15 -0700 Subject: x86, CPA: Add set_pages_arrayuc and set_pages_array_wb Add new interfaces: set_pages_array_uc() set_pages_array_wb() that can be used change the page attribute for a bunch of pages with flush etc done once at the end of all the changes. These interfaces are similar to existing set_memory_array_uc() and set_memory_array_wc(). Signed-off-by: Venkatesh Pallipadi Cc: arjan@infradead.org Cc: eric@anholt.net Cc: airlied@redhat.com LKML-Reference: <20090319215358.901545000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e5c257fb41e..d71e1b636ce 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -920,6 +920,20 @@ static inline int change_page_attr_clear(unsigned long *addr, int numpages, (array ? CPA_ARRAY : 0), NULL); } +static inline int cpa_set_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, + CPA_PAGES_ARRAY, pages); +} + +static inline int cpa_clear_pages_array(struct page **pages, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, + CPA_PAGES_ARRAY, pages); +} + int _set_memory_uc(unsigned long addr, int numpages) { /* @@ -1076,6 +1090,35 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); +int set_pages_array_uc(struct page **pages, int addrinarray) +{ + unsigned long start; + unsigned long end; + int i; + int free_idx; + + for (i = 0; i < addrinarray; i++) { + start = (unsigned long)page_address(pages[i]); + end = start + PAGE_SIZE; + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + goto err_out; + } + + if (cpa_set_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) { + return 0; /* Success */ + } +err_out: + free_idx = i; + for (i = 0; i < free_idx; i++) { + start = (unsigned long)page_address(pages[i]); + end = start + PAGE_SIZE; + free_memtype(start, end); + } + return -EINVAL; +} +EXPORT_SYMBOL(set_pages_array_uc); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -1084,6 +1127,26 @@ int set_pages_wb(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_wb); +int set_pages_array_wb(struct page **pages, int addrinarray) +{ + int retval; + unsigned long start; + unsigned long end; + int i; + + retval = cpa_clear_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_MASK)); + + for (i = 0; i < addrinarray; i++) { + start = (unsigned long)page_address(pages[i]); + end = start + PAGE_SIZE; + free_memtype(start, end); + } + + return retval; +} +EXPORT_SYMBOL(set_pages_array_wb); + int set_pages_x(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); -- cgit v1.2.3 From 45c7b28f3c7e3a45cc5a597cc19816a9015ee8ae Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 20 Mar 2009 17:53:34 -0700 Subject: Revert "x86: create a non-zero sized bm_pte only when needed" This reverts commit 698609bdcd35d0641f4c6622c83680ab1a6d67cb. 69860 breaks Xen booting, as it relies on head*.S to set up the fixmap pagetables (as a side-effect of initializing the USB debug port). Xen, however, does not boot via head*.S, and so the fixmap area is not initialized. The specific symptom of the crash is a fault in dmi_scan(), because the pointer that early_ioremap returns is not actually present. Signed-off-by: Jeremy Fitzhardinge Cc: Jan Beulich LKML-Reference: <49C43A8E.5090203@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 55e127f71ed..83ed74affba 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -487,12 +487,7 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -#define __FIXADDR_TOP (-PAGE_SIZE) -static pte_t bm_pte[(__fix_to_virt(FIX_DBGP_BASE) - ^ __fix_to_virt(FIX_BTMAP_BEGIN)) >> PMD_SHIFT - ? PAGE_SIZE / sizeof(pte_t) : 0] __page_aligned_bss; -#undef __FIXADDR_TOP -static __initdata pte_t *bm_ptep; +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { @@ -507,8 +502,6 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) static inline pte_t * __init early_ioremap_pte(unsigned long addr) { - if (!sizeof(bm_pte)) - return &bm_ptep[pte_index(addr)]; return &bm_pte[pte_index(addr)]; } @@ -526,14 +519,8 @@ void __init early_ioremap_init(void) slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); - if (sizeof(bm_pte)) { - memset(bm_pte, 0, sizeof(bm_pte)); - pmd_populate_kernel(&init_mm, pmd, bm_pte); - } else { - bm_ptep = pte_offset_kernel(pmd, 0); - if (early_ioremap_debug) - printk(KERN_INFO "bm_ptep=%p\n", bm_ptep); - } + memset(bm_pte, 0, sizeof(bm_pte)); + pmd_populate_kernel(&init_mm, pmd, bm_pte); /* * The boot-ioremap range spans multiple pmds, for which -- cgit v1.2.3 From 9f4f25c86ff2233dd98d4bd6968afb1ca66558a0 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Wed, 25 Mar 2009 14:07:11 +0100 Subject: x86: early_ioremap_init(), use __fix_to_virt(), because we are sure it's safe Tetsuo Handa reported this link bug: | arch/x86/mm/built-in.o(.init.text+0x1831): In function `early_ioremap_init': | : undefined reference to `__this_fixmap_does_not_exist' | make: *** [.tmp_vmlinux1] Error 1 Commit:8827247ffcc9e880cbe4705655065cf011265157 used a variable (which would be optimized to constant) as fix_to_virt()'s parameter. It's depended on gcc's optimization and fails on old gcc. (Tetsuo used gcc 3.3) We can use __fix_to_vir() instead, because we know it's safe and don't need link time error reporting. Reported-by: Tetsuo Handa Signed-off-by: Wang Chen Cc: sfr@canb.auug.org.au LKML-Reference: <49C9FFEA.7060908@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 83ed74affba..0dfa09d69e8 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -516,7 +516,7 @@ void __init early_ioremap_init(void) printk(KERN_INFO "early_ioremap_init()\n"); for (i = 0; i < FIX_BTMAPS_SLOTS; i++) - slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); memset(bm_pte, 0, sizeof(bm_pte)); -- cgit v1.2.3 From f4112de6b679d84bd9b9681c7504be7bdfb7c7d5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: mm: introduce debug_kmap_atomic x86 has debug_kmap_atomic_prot() which is error checking function for kmap_atomic. It is usefull for the other architectures, although it needs CONFIG_TRACE_IRQFLAGS_SUPPORT. This patch exposes it to the other architectures. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 45 +-------------------------------------------- 1 file changed, 1 insertion(+), 44 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 522db5e3d0b..8126e8d1a2a 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -19,49 +19,6 @@ void kunmap(struct page *page) kunmap_high(page); } -static void debug_kmap_atomic_prot(enum km_type type) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -#endif -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -81,7 +38,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic_prot(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -- cgit v1.2.3 From 7ca43e7564679604d86e9ed834e7bbcffd8a4a3f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Mar 2009 15:23:25 -0700 Subject: mm: use debug_kmap_atomic Use debug_kmap_atomic in kmap_atomic, kmap_atomic_pfn, and iomap_atomic_prot_pfn. Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/highmem_32.c | 1 + arch/x86/mm/iomap_32.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8126e8d1a2a..5bc5d1688c1 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -40,6 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) debug_kmap_atomic(type); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 699c9b2895a..bff0c9032f8 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -19,6 +19,7 @@ #include #include #include +#include int is_io_mapping_possible(resource_size_t base, unsigned long size) { @@ -71,6 +72,7 @@ iounmap_atomic(void *kvaddr, enum km_type type) unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + debug_kmap_atomic(type); /* * Force other mappings to Oops if they'll try to access this pte * without first remap it. Keeping stale mappings around is a bad idea -- cgit v1.2.3 From a7f8c50d90a4e983c456ae75e534b5cd6c03674b Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 2 Apr 2009 16:01:26 +0900 Subject: x86, mm: fix misuse of debug_kmap_atomic Impact: fix CONFIG_DEBUG_HIGHMEM=y breakage Commit 7ca43e756 ("mm: use debug_kmap_atomic") introduced some debug_kmap_atomic() calls in the wrong places. Signed-off-by: Akinobu Mita Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20090402070126.GA3951@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 1 - arch/x86/mm/iomap_32.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5bc5d1688c1..8126e8d1a2a 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -40,7 +40,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) debug_kmap_atomic(type); - debug_kmap_atomic(type); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index bff0c9032f8..e331f77348a 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -39,6 +39,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) pagefault_disable(); + debug_kmap_atomic(type); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); @@ -72,7 +73,6 @@ iounmap_atomic(void *kvaddr, enum km_type type) unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - debug_kmap_atomic(type); /* * Force other mappings to Oops if they'll try to access this pte * without first remap it. Keeping stale mappings around is a bad idea -- cgit v1.2.3 From 6a491e2e3e52a64c6d88a192c56499d931842ac5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Apr 2009 16:44:38 -0700 Subject: x86: fix is_io_mapping_possible() build warning on i386 allnoconfig i386 allnoconfig: arch/x86/mm/iomap_32.c: In function 'is_io_mapping_possible': arch/x86/mm/iomap_32.c:27: warning: comparison is always false due to limited range of data type Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/mm/iomap_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index e331f77348a..8056545e2d3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -23,7 +23,7 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) { -#ifndef CONFIG_X86_PAE +#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ if (base + size > 0x100000000ULL) return 0; -- cgit v1.2.3