diff options
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r-- | arch/x86_64/mm/Makefile | 2 | ||||
-rw-r--r-- | arch/x86_64/mm/fault.c | 73 | ||||
-rw-r--r-- | arch/x86_64/mm/init.c | 333 | ||||
-rw-r--r-- | arch/x86_64/mm/ioremap.c | 37 | ||||
-rw-r--r-- | arch/x86_64/mm/k8topology.c | 1 | ||||
-rw-r--r-- | arch/x86_64/mm/mmap.c | 30 | ||||
-rw-r--r-- | arch/x86_64/mm/numa.c | 189 | ||||
-rw-r--r-- | arch/x86_64/mm/pageattr.c | 9 | ||||
-rw-r--r-- | arch/x86_64/mm/srat.c | 73 |
9 files changed, 557 insertions, 190 deletions
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile index 1d232a87f11..d25ac86fe27 100644 --- a/arch/x86_64/mm/Makefile +++ b/arch/x86_64/mm/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux x86_64-specific parts of the memory manager. # -obj-y := init.o fault.o ioremap.o extable.o pageattr.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index b75b872ec15..26eac194064 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -35,6 +35,13 @@ #include <asm-generic/sections.h> #include <asm/kdebug.h> +/* Page fault error code bits */ +#define PF_PROT (1<<0) /* or no page found */ +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; @@ -68,7 +75,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char *max_instr; /* If it was a exec fault ignore */ - if (error_code & (1<<4)) + if (error_code & PF_INSTR) return 0; instr = (unsigned char *)convert_rip_to_linear(current, regs); @@ -222,17 +229,22 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { unsigned long flags = oops_begin(); + struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", current->comm, address); dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; __die("Bad pagetable", regs, error_code); oops_end(flags); do_exit(SIGKILL); } /* - * Handle a fault on the vmalloc or module mapping area + * Handle a fault on the vmalloc area * * This assumes no large pages in there. */ @@ -278,7 +290,6 @@ static int vmalloc_fault(unsigned long address) that. */ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) BUG(); - __flush_tlb_all(); return 0; } @@ -289,12 +300,6 @@ int exception_trace = 1; * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means fault was an instruction fetch */ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) @@ -308,18 +313,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long flags; siginfo_t info; -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("page_fault: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - /* get the address */ __asm__("movq %%cr2,%0":"=r" (address)); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, @@ -349,12 +342,16 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * * This verifies that the fault happens in kernel space * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 1) == 0. + * protection error (error_code & 9) == 0. */ if (unlikely(address >= TASK_SIZE64)) { - if (!(error_code & 5) && - ((address >= VMALLOC_START && address < VMALLOC_END) || - (address >= MODULES_VADDR && address < MODULES_END))) { + /* + * Don't check for the module range here: its PML4 + * is always initialized because it's shared with the main + * kernel text. Only vmalloc may need PML4 syncups. + */ + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + ((address >= VMALLOC_START && address < VMALLOC_END))) { if (vmalloc_fault(address) < 0) goto bad_area_nosemaphore; return; @@ -366,7 +363,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (unlikely(error_code & (1 << 3))) + if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); /* @@ -393,7 +390,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * thus avoiding the deadlock. */ if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && + if ((error_code & PF_USER) == 0 && !search_exception_tables(regs->rip)) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); @@ -420,17 +417,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, good_area: info.si_code = SEGV_ACCERR; write = 0; - switch (error_code & 3) { + switch (error_code & (PF_PROT|PF_WRITE)) { default: /* 3: write, present */ /* fall through */ - case 2: /* write, not present */ + case PF_WRITE: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; write++; break; - case 1: /* read, present */ + case PF_PROT: /* read, present */ goto bad_area; - case 0: /* read, not present */ + case 0: /* read, not present */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } @@ -465,7 +462,7 @@ bad_area: bad_area_nosemaphore: /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { + if (error_code & PF_USER) { if (is_prefetch(regs, address, error_code)) return; @@ -533,6 +530,9 @@ no_context: printk_address(regs->rip); printk("\n"); dump_pagetable(address); + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; __die("Oops", regs, error_code); /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); @@ -558,7 +558,7 @@ do_sigbus: up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) + if (!(error_code & PF_USER)) goto no_context; tsk->thread.cr2 = address; @@ -571,3 +571,10 @@ do_sigbus: force_sig_info(SIGBUS, &info, tsk); return; } + +static int __init enable_pagefaulttrace(char *str) +{ + page_fault_trace = 1; + return 0; +} +__setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index e60a1a848de..7af1742aa95 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -22,6 +22,10 @@ #include <linux/pagemap.h> #include <linux/bootmem.h> #include <linux/proc_fs.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/module.h> +#include <linux/memory_hotplug.h> #include <asm/processor.h> #include <asm/system.h> @@ -36,16 +40,18 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/smp.h> +#include <asm/sections.h> +#include <asm/dma-mapping.h> +#include <asm/swiotlb.h> #ifndef Dprintk #define Dprintk(x...) #endif -#ifdef CONFIG_GART_IOMMU -extern int swiotlb; -#endif +struct dma_mapping_ops* dma_ops; +EXPORT_SYMBOL(dma_ops); -extern char _stext[]; +static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -86,9 +92,6 @@ void show_mem(void) /* References to section boundaries */ -extern char _text, _etext, _edata, __bss_start, _end[]; -extern char __init_begin, __init_end; - int after_bootmem; static void *spp_getpage(void) @@ -179,13 +182,19 @@ static struct temp_map { {} }; -static __init void *alloc_low_page(int *index, unsigned long *phys) +static __meminit void *alloc_low_page(int *index, unsigned long *phys) { struct temp_map *ti; int i; unsigned long pfn = table_end++, paddr; void *adr; + if (after_bootmem) { + adr = (void *)get_zeroed_page(GFP_ATOMIC); + *phys = __pa(adr); + return adr; + } + if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); for (i = 0; temp_mappings[i].allocated; i++) { @@ -198,55 +207,86 @@ static __init void *alloc_low_page(int *index, unsigned long *phys) ti->allocated = 1; __flush_tlb(); adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + memset(adr, 0, PAGE_SIZE); *index = i; *phys = pfn * PAGE_SIZE; return adr; } -static __init void unmap_low_page(int i) +static __meminit void unmap_low_page(int i) { - struct temp_map *ti = &temp_mappings[i]; + struct temp_map *ti; + + if (after_bootmem) + return; + + ti = &temp_mappings[i]; set_pmd(ti->pmd, __pmd(0)); ti->allocated = 0; } -static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void __meminit +phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +{ + int i; + + for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + unsigned long entry; + + if (address > end) { + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); + break; + } + entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; + entry &= __supported_pte_mask; + set_pmd(pmd, __pmd(entry)); + } +} + +static void __meminit +phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); + + if (pmd_none(*pmd)) { + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); + } +} + +static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { - long i, j; + long i = pud_index(address); - i = pud_index(address); pud = pud + i; + + if (after_bootmem && pud_val(*pud)) { + phys_pmd_update(pud, address, end); + return; + } + for (; i < PTRS_PER_PUD; pud++, i++) { int map; unsigned long paddr, pmd_phys; pmd_t *pmd; - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); + paddr = (address & PGDIR_MASK) + i*PUD_SIZE; + if (paddr >= end) break; - } - if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } pmd = alloc_low_page(&map, &pmd_phys); + spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) { - unsigned long pe; - - if (paddr >= end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr; - pe &= __supported_pte_mask; - set_pmd(pmd, __pmd(pe)); - } + phys_pmd_init(pmd, paddr, end); + spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } __flush_tlb(); @@ -254,25 +294,32 @@ static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned lon static void __init find_early_table_space(unsigned long end) { - unsigned long puds, pmds, tables; + unsigned long puds, pmds, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables); + /* RED-PEN putting page tables only on node 0 could + cause a hotspot and fill up ZONE_DMA. The page tables + need roughly 0.5KB per GB. */ + start = 0x8000; + table_start = find_e820_area(start, end, tables); if (table_start == -1UL) panic("Cannot find space for the kernel page tables"); table_start >>= PAGE_SHIFT; table_end = table_start; + + early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", + end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); } /* Setup the direct mapping of the physical memory at PAGE_OFFSET. This runs before bootmem is initialized and gets pages directly from the physical memory. To access them they are temporarily mapped. */ -void __init init_memory_mapping(unsigned long start, unsigned long end) +void __meminit init_memory_mapping(unsigned long start, unsigned long end) { unsigned long next; @@ -284,7 +331,8 @@ void __init init_memory_mapping(unsigned long start, unsigned long end) * mapped. Unfortunately this is done currently before the nodes are * discovered. */ - find_early_table_space(end); + if (!after_bootmem) + find_early_table_space(end); start = (unsigned long)__va(start); end = (unsigned long)__va(end); @@ -292,58 +340,106 @@ void __init init_memory_mapping(unsigned long start, unsigned long end) for (; start < end; start = next) { int map; unsigned long pud_phys; - pud_t *pud = alloc_low_page(&map, &pud_phys); + pgd_t *pgd = pgd_offset_k(start); + pud_t *pud; + + if (after_bootmem) + pud = pud_offset_k(pgd, __PAGE_OFFSET); + else + pud = alloc_low_page(&map, &pud_phys); + next = start + PGDIR_SIZE; if (next > end) next = end; phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); + if (!after_bootmem) + set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(map); } - asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); + if (!after_bootmem) + asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features)); __flush_tlb_all(); - early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<<PAGE_SHIFT, - table_end<<PAGE_SHIFT); } -extern struct x8664_pda cpu_pda[NR_CPUS]; +void __cpuinit zap_low_mappings(int cpu) +{ + if (cpu == 0) { + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + } else { + /* + * For AP's, zap the low identity mappings by changing the cr3 + * to init_level4_pgt and doing local flush tlb all + */ + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); + } + __flush_tlb_all(); +} -/* Assumes all CPUs still execute in init_mm */ -void zap_low_mappings(void) +/* Compute zone sizes for the DMA and DMA32 zones in a node. */ +__init void +size_zones(unsigned long *z, unsigned long *h, + unsigned long start_pfn, unsigned long end_pfn) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - flush_tlb_all(); + int i; + unsigned long w; + + for (i = 0; i < MAX_NR_ZONES; i++) + z[i] = 0; + + if (start_pfn < MAX_DMA_PFN) + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; + if (start_pfn < MAX_DMA32_PFN) { + unsigned long dma32_pfn = MAX_DMA32_PFN; + if (dma32_pfn > end_pfn) + dma32_pfn = end_pfn; + z[ZONE_DMA32] = dma32_pfn - start_pfn; + } + z[ZONE_NORMAL] = end_pfn - start_pfn; + + /* Remove lower zones from higher ones. */ + w = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + if (z[i]) + z[i] -= w; + w += z[i]; + } + + /* Compute holes */ + w = start_pfn; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long s = w; + w += z[i]; + h[i] = e820_hole_size(s, w); + } + + /* Add the space pace needed for mem_map to the holes too. */ + for (i = 0; i < MAX_NR_ZONES; i++) + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; + + /* The 16MB DMA zone has the kernel and other misc mappings. + Account them too */ + if (h[ZONE_DMA]) { + h[ZONE_DMA] += dma_reserve; + if (h[ZONE_DMA] >= z[ZONE_DMA]) { + printk(KERN_WARNING + "Kernel too large and filling up ZONE_DMA?\n"); + h[ZONE_DMA] = z[ZONE_DMA]; + } + } } #ifndef CONFIG_NUMA void __init paging_init(void) { - { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long holes[MAX_NR_ZONES]; - unsigned int max_dma; - - memset(zones_size, 0, sizeof(zones_size)); - memset(holes, 0, sizeof(holes)); - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - if (end_pfn < max_dma) { - zones_size[ZONE_DMA] = end_pfn; - holes[ZONE_DMA] = e820_hole_size(0, end_pfn); - } else { - zones_size[ZONE_DMA] = max_dma; - holes[ZONE_DMA] = e820_hole_size(0, max_dma); - zones_size[ZONE_NORMAL] = end_pfn - max_dma; - holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn); - } - free_area_init_node(0, NODE_DATA(0), zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); - } - return; + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; + + memory_present(0, 0, end_pfn); + sparse_init(); + size_zones(zones, holes, 0, end_pfn); + free_area_init_node(0, NODE_DATA(0), zones, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); } #endif @@ -381,6 +477,50 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) __flush_tlb_all(); } +/* + * Memory hotplug specific functions + * These are only for non-NUMA machines right now. + */ +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + totalram_pages++; + num_physpages++; +} + +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdat = NODE_DATA(0); + struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int ret; + + ret = __add_pages(zone, start_pfn, nr_pages); + if (ret) + goto error; + + init_memory_mapping(start, (start + size -1)); + + return ret; +error: + printk("%s: Problem encountered in __add_pages!\n", __func__); + return ret; +} +EXPORT_SYMBOL_GPL(add_memory); + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(remove_memory); + +#endif + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -389,12 +529,9 @@ void __init mem_init(void) long codesize, reservedpages, datasize, initsize; #ifdef CONFIG_SWIOTLB - if (!iommu_aperture && - (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) - swiotlb = 1; - if (swiotlb) - swiotlb_init(); + pci_swiotlb_init(); #endif + no_iommu_init(); /* How many end-of-memory variables you have, grandma! */ max_low_pfn = end_pfn; @@ -438,19 +575,16 @@ void __init mem_init(void) datasize >> 10, initsize >> 10); +#ifdef CONFIG_SMP /* - * Subtle. SMP is doing its boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. + * Sync boot_level4_pgt mappings with the init_level4_pgt + * except for the low identity mappings which are already zapped + * in init_level4_pgt. This sync-up is essential for AP's bringup */ -#ifndef CONFIG_SMP - zap_low_mappings(); + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); #endif } -extern char __initdata_begin[], __initdata_end[]; - void free_initmem(void) { unsigned long addr; @@ -464,13 +598,36 @@ void free_initmem(void) totalram_pages++; } memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); - printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); + printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); } +#ifdef CONFIG_DEBUG_RODATA + +extern char __start_rodata, __end_rodata; +void mark_rodata_ro(void) +{ + unsigned long addr = (unsigned long)&__start_rodata; + + for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE) + change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); + + printk ("Write protecting the kernel read-only data: %luk\n", + (&__end_rodata - &__start_rodata) >> 10); + + /* + * change_page_attr_addr() requires a global_flush_tlb() call after it. + * We do this after the printk so that if something went wrong in the + * change, the printk gets out at least to give a better debug hint + * of who is the culprit. + */ + global_flush_tlb(); +} +#endif + #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start < (unsigned long)&_end) + if (start >= end) return; printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { @@ -491,6 +648,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) #else reserve_bootmem(phys, len); #endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) + dma_reserve += len / PAGE_SIZE; } int kern_addr_valid(unsigned long addr) @@ -532,10 +691,6 @@ extern int exception_trace, page_fault_trace; static ctl_table debug_table2[] = { { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, proc_dointvec }, -#ifdef CONFIG_CHECKING - { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, - proc_dointvec }, -#endif { 0, } }; diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index ecf7acb5db9..ae207064201 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -247,9 +247,15 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) return __ioremap(phys_addr, size, _PAGE_PCD); } +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ void iounmap(volatile void __iomem *addr) { - struct vm_struct *p; + struct vm_struct *p, *o; if (addr <= high_memory) return; @@ -257,12 +263,31 @@ void iounmap(volatile void __iomem *addr) addr < phys_to_virt(ISA_END_ADDRESS)) return; - write_lock(&vmlist_lock); - p = __remove_vm_area((void *)((unsigned long)addr & PAGE_MASK)); - if (!p) + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { printk("iounmap: bad address %p\n", addr); - else if (p->flags >> 20) + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + if (p->flags >> 20) ioremap_change_attr(p->phys_addr, p->size, 0); - write_unlock(&vmlist_lock); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); kfree(p); } diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index 65417b040c1..a5663e0bb01 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) limit >>= 16; limit <<= 24; limit |= (1<<24)-1; + limit++; if (limit > end_pfn << PAGE_SHIFT) limit = end_pfn << PAGE_SHIFT; diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c new file mode 100644 index 00000000000..43e9b99bdf2 --- /dev/null +++ b/arch/x86_64/mm/mmap.c @@ -0,0 +1,30 @@ +/* Copyright 2005 Andi Kleen, SuSE Labs. + * Licensed under GPL, v.2 + */ +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/random.h> +#include <asm/ia32.h> + +/* Notebook: move the mmap code from sys_x86_64.c over here. */ + +void arch_pick_mmap_layout(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->flags & _TIF_IA32) + return ia32_pick_mmap_layout(mm); +#endif + mm->mmap_base = TASK_UNMAPPED_BASE; + if (current->flags & PF_RANDOMIZE) { + /* Add 28bit randomness which is about 40bits of address space + because mmap base has to be page aligned. + or ~1/128 of the total user VM + (total user address space is 47bits) */ + unsigned rnd = get_random_int() & 0xfffffff; + mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; + } + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} + diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 21480382100..6ef9f9a7623 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -38,38 +38,59 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; -int __init compute_hash_shift(struct node *nodes, int numnodes) + +/* + * Given a shift value, try to populate memnodemap[] + * Returns : + * 1 if OK + * 0 if memnodmap[] too small (of shift too small) + * -1 if node overlap or lost ram (shift too big) + */ +static int __init +populate_memnodemap(const struct node *nodes, int numnodes, int shift) { int i; - int shift = 20; - unsigned long addr,maxend=0; - - for (i = 0; i < numnodes; i++) - if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend)) - maxend = nodes[i].end; + int res = -1; + unsigned long addr, end; - while ((1UL << shift) < (maxend / NODEMAPSIZE)) - shift++; - - printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n", - shift,maxend); - memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE); + if (shift >= 64) + return -1; + memset(memnodemap, 0xff, sizeof(memnodemap)); for (i = 0; i < numnodes; i++) { - if (nodes[i].start == nodes[i].end) + addr = nodes[i].start; + end = nodes[i].end; + if (addr >= end) continue; - for (addr = nodes[i].start; - addr < nodes[i].end; - addr += (1UL << shift)) { - if (memnodemap[addr >> shift] != 0xff) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d adder=%lu\n", - shift,addr); + if ((end >> shift) >= NODEMAPSIZE) + return 0; + do { + if (memnodemap[addr >> shift] != 0xff) return -1; - } memnodemap[addr >> shift] = i; - } + addr += (1UL << shift); + } while (addr < end); + res = 1; } + return res; +} + +int __init compute_hash_shift(struct node *nodes, int numnodes) +{ + int shift = 20; + + while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) + shift++; + + printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", + shift); + + if (populate_memnodemap(nodes, numnodes, shift) != 1) { + printk(KERN_INFO + "Your memory is not aligned you need to rebuild your kernel " + "with a bigger NODEMAPSIZE shift=%d\n", + shift); + return -1; + } return shift; } @@ -89,12 +110,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start = round_up(start, ZONE_ALIGN); - printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); @@ -132,29 +152,14 @@ void __init setup_node_zones(int nodeid) unsigned long start_pfn, end_pfn; unsigned long zones[MAX_NR_ZONES]; unsigned long holes[MAX_NR_ZONES]; - unsigned long dma_end_pfn; - memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); - memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES); + start_pfn = node_start_pfn(nodeid); + end_pfn = node_end_pfn(nodeid); - start_pfn = node_start_pfn(nodeid); - end_pfn = node_end_pfn(nodeid); + Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n", + nodeid, start_pfn, end_pfn); - Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); - - /* All nodes > 0 have a zero length zone DMA */ - dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; - if (start_pfn < dma_end_pfn) { - zones[ZONE_DMA] = dma_end_pfn - start_pfn; - holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn); - zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; - holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn); - - } else { - zones[ZONE_NORMAL] = end_pfn - start_pfn; - holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn); - } - + size_zones(zones, holes, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, start_pfn, holes); } @@ -171,7 +176,7 @@ void __init numa_init_array(void) for (i = 0; i < NR_CPUS; i++) { if (cpu_to_node[i] != NUMA_NO_NODE) continue; - cpu_to_node[i] = rr; + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); @@ -195,7 +200,7 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) while ((x << 1) < sz) x <<= 1; if (x < sz/2) - printk("Numa emulation unbalanced. Complain to maintainer\n"); + printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); sz = x; } @@ -205,8 +210,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) if (i == numa_fake-1) sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; nodes[i].end = nodes[i].start + sz; - if (i != numa_fake-1) - nodes[i].end--; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, @@ -257,7 +260,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) nodes_clear(node_online_map); node_set_online(0); for (i = 0; i < NR_CPUS; i++) - cpu_to_node[i] = 0; + numa_set_node(i, 0); node_to_cpumask[0] = cpumask_of_cpu(0); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } @@ -267,6 +270,12 @@ __cpuinit void numa_add_cpu(int cpu) set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); } +void __cpuinit numa_set_node(int cpu, int node) +{ + cpu_pda(cpu)->nodenumber = node; + cpu_to_node[cpu] = node; +} + unsigned long __init numa_free_all_bootmem(void) { int i; @@ -277,9 +286,26 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } +#ifdef CONFIG_SPARSEMEM +static void __init arch_sparse_init(void) +{ + int i; + + for_each_online_node(i) + memory_present(i, node_start_pfn(i), node_end_pfn(i)); + + sparse_init(); +} +#else +#define arch_sparse_init() do {} while (0) +#endif + void __init paging_init(void) { int i; + + arch_sparse_init(); + for_each_online_node(i) { setup_node_zones(i); } @@ -304,8 +330,69 @@ __init int numa_setup(char *opt) return 1; } +/* + * Setup early cpu_to_node. + * + * Populate cpu_to_node[] only if x86_cpu_to_apicid[], + * and apicid_to_node[] tables have valid entries for a CPU. + * This means we skip cpu_to_node[] initialisation for NUMA + * emulation and faking node case (when running a kernel compiled + * for NUMA on a non NUMA box), which is OK as cpu_to_node[] + * is already initialized in a round robin manner at numa_init_array, + * prior to this call, and this initialization is good enough + * for the fake NUMA cases. + */ +void __init init_cpu_to_node(void) +{ + int i; + for (i = 0; i < NR_CPUS; i++) { + u8 apicid = x86_cpu_to_apicid[i]; + if (apicid == BAD_APICID) + continue; + if (apicid_to_node[apicid] == NUMA_NO_NODE) + continue; + cpu_to_node[i] = apicid_to_node[apicid]; + } +} + EXPORT_SYMBOL(cpu_to_node); EXPORT_SYMBOL(node_to_cpumask); EXPORT_SYMBOL(memnode_shift); EXPORT_SYMBOL(memnodemap); EXPORT_SYMBOL(node_data); + +#ifdef CONFIG_DISCONTIGMEM +/* + * Functions to convert PFNs from/to per node page addresses. + * These are out of line because they are quite big. + * They could be all tuned by pre caching more state. + * Should do that. + */ + +/* Requires pfn_valid(pfn) to be true */ +struct page *pfn_to_page(unsigned long pfn) +{ + int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); + return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; +} +EXPORT_SYMBOL(pfn_to_page); + +unsigned long page_to_pfn(struct page *page) +{ + return (long)(((page) - page_zone(page)->zone_mem_map) + + page_zone(page)->zone_start_pfn); +} +EXPORT_SYMBOL(page_to_pfn); + +int pfn_valid(unsigned long pfn) +{ + unsigned nid; + if (pfn >= num_physpages) + return 0; + nid = pfn_to_nid(pfn); + if (nid == 0xff) + return 0; + return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); +} +EXPORT_SYMBOL(pfn_valid); +#endif diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index b90e8fe9eeb..35f1f1aab06 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -128,6 +128,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, pte_t *kpte; struct page *kpte_page; unsigned kpte_flags; + pgprot_t ref_prot2; kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); @@ -140,10 +141,14 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, * split_large_page will take the reference for this change_page_attr * on the split page. */ - struct page *split = split_large_page(address, prot, ref_prot); + + struct page *split; + ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); + + split = split_large_page(address, prot, ref_prot2); if (!split) return -ENOMEM; - set_pte(kpte,mk_pte(split, ref_prot)); + set_pte(kpte,mk_pte(split, ref_prot2)); kpte_page = split; } get_page(kpte_page); diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 4b2e844c15a..8b7f85608fa 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -17,21 +17,23 @@ #include <linux/topology.h> #include <asm/proto.h> #include <asm/numa.h> +#include <asm/e820.h> static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct node nodes[MAX_NUMNODES] __initdata; -static __u8 pxm2node[256] = { [0 ... 255] = 0xff }; +static u8 pxm2node[256] = { [0 ... 255] = 0xff }; static int node_to_pxm(int n); int pxm_to_node(int pxm) { if ((unsigned)pxm >= 256) - return 0; - return pxm2node[pxm]; + return -1; + /* Extend 0xff to (int)-1 */ + return (signed char)pxm2node[pxm]; } static __init int setup_node(int pxm) @@ -71,8 +73,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end) nd->start = nd->end; } if (nd->end > end) { - if (!(end & 0xfff)) - end--; nd->end = end; if (nd->start > nd->end) nd->start = nd->end; @@ -93,9 +93,36 @@ static __init inline int srat_disabled(void) return numa_off || acpi_numa < 0; } +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static __init int slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->localities; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != 10) + return 0; + } else if (val <= 10) + return 0; + } + } + return 1; +} + /* Callback for SLIT parsing */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { + if (!slit_valid(slit)) { + printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n"); + return; + } acpi_slit = slit; } @@ -166,18 +193,43 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (nd->end < end) nd->end = end; } - if (!(nd->end & 0xfff)) - nd->end--; printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); } +/* Sanity check to catch more bad SRATs (they are amazingly common). + Make sure the PXMs cover all memory. */ +static int nodes_cover_memory(void) +{ + int i; + unsigned long pxmram, e820ram; + + pxmram = 0; + for_each_node_mask(i, nodes_parsed) { + unsigned long s = nodes[i].start >> PAGE_SHIFT; + unsigned long e = nodes[i].end >> PAGE_SHIFT; + pxmram += e - s; + pxmram -= e820_hole_size(s, e); + } + + e820ram = end_pfn - e820_hole_size(0, end_pfn); + if (pxmram < e820ram) { + printk(KERN_ERR + "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", + (pxmram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return 0; + } + return 1; +} + void __init acpi_numa_arch_fixup(void) {} /* Use the information discovered above to actually set up the nodes. */ int __init acpi_scan_nodes(unsigned long start, unsigned long end) { int i; + if (acpi_numa <= 0) return -1; @@ -188,6 +240,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) node_clear(i, nodes_parsed); } + if (!nodes_cover_memory()) { + bad_srat(); + return -1; + } + memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed)); if (memnode_shift < 0) { printk(KERN_ERR @@ -203,7 +260,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (cpu_to_node[i] == NUMA_NO_NODE) continue; if (!node_isset(cpu_to_node[i], nodes_parsed)) - cpu_to_node[i] = NUMA_NO_NODE; + numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); return 0; |