diff options
Diffstat (limited to 'arch/x86/mm')
27 files changed, 3286 insertions, 3330 deletions
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32 index 362b4ad082d..c36ae88bb54 100644 --- a/arch/x86/mm/Makefile_32 +++ b/arch/x86/mm/Makefile_32 @@ -2,9 +2,8 @@ # Makefile for the linux i386-specific parts of the memory manager. # -obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o +obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_NUMA) += discontig_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 index 6bcb47945b8..688c8c28ac8 100644 --- a/arch/x86/mm/Makefile_64 +++ b/arch/x86/mm/Makefile_64 @@ -2,9 +2,8 @@ # Makefile for the linux x86_64-specific parts of the memory manager. # -obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o +obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_64.o - diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c deleted file mode 100644 index f14da2a53ec..00000000000 --- a/arch/x86/mm/boot_ioremap_32.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * arch/i386/mm/boot_ioremap.c - * - * Re-map functions for early boot-time before paging_init() when the - * boot-time pagetables are still in use - * - * Written by Dave Hansen <haveblue@us.ibm.com> - */ - - -/* - * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE - * keeps that from happening. If anyone has a better way, I'm listening. - * - * boot_pte_t is defined only if this all works correctly - */ - -#undef CONFIG_X86_PAE -#undef CONFIG_PARAVIRT -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <linux/init.h> -#include <linux/stddef.h> - -/* - * I'm cheating here. It is known that the two boot PTE pages are - * allocated next to each other. I'm pretending that they're just - * one big array. - */ - -#define BOOT_PTE_PTRS (PTRS_PER_PTE*2) - -static unsigned long boot_pte_index(unsigned long vaddr) -{ - return __pa(vaddr) >> PAGE_SHIFT; -} - -static inline boot_pte_t* boot_vaddr_to_pte(void *address) -{ - boot_pte_t* boot_pg = (boot_pte_t*)pg0; - return &boot_pg[boot_pte_index((unsigned long)address)]; -} - -/* - * This is only for a caller who is clever enough to page-align - * phys_addr and virtual_source, and who also has a preference - * about which virtual address from which to steal ptes - */ -static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, - void* virtual_source) -{ - boot_pte_t* pte; - int i; - char *vaddr = virtual_source; - - pte = boot_vaddr_to_pte(virtual_source); - for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) { - set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL)); - __flush_tlb_one(&vaddr[i*PAGE_SIZE]); - } -} - -/* the virtual space we're going to remap comes from this array */ -#define BOOT_IOREMAP_PAGES 4 -#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE) -static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); - -/* - * This only applies to things which need to ioremap before paging_init() - * bt_ioremap() and plain ioremap() are both useless at this point. - * - * When used, we're still using the boot-time pagetables, which only - * have 2 PTE pages mapping the first 8MB - * - * There is no unmap. The boot-time PTE pages aren't used after boot. - * If you really want the space back, just remap it yourself. - * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE) - */ -__init void* boot_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr, offset; - unsigned int nrpages; - - last_addr = phys_addr + size - 1; - - /* page align the requested address */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - nrpages = size >> PAGE_SHIFT; - if (nrpages > BOOT_IOREMAP_PAGES) - return NULL; - - __boot_ioremap(phys_addr, nrpages, boot_ioremap_space); - - return &boot_ioremap_space[offset]; -} diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 13a474d3c6e..04b1d20e261 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -32,6 +32,7 @@ #include <linux/kexec.h> #include <linux/pfn.h> #include <linux/swap.h> +#include <linux/acpi.h> #include <asm/e820.h> #include <asm/setup.h> @@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; static unsigned long kva_start_pfn; static unsigned long kva_pages; /* @@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid) } } +#ifdef CONFIG_DISCONTIGMEM +/* + * In the discontig memory model, a portion of the kernel virtual area (KVA) + * is reserved and portions of nodes are mapped using it. This is to allow + * node-local memory to be allocated for structures that would normally require + * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers + * should be prepared to allocate from the bootmem allocator instead. This KVA + * mechanism is incompatible with SPARSEMEM as it makes assumptions about the + * layout of memory that are broken if alloc_remap() succeeds for some of the + * map and fails for others + */ +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +static unsigned long node_remap_offset[MAX_NUMNODES]; + void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; @@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void) return reserve_pages; } +static void init_remap_allocator(int nid) +{ + node_remap_start_vaddr[nid] = pfn_to_kaddr( + kva_start_pfn + node_remap_offset[nid]); + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) pfn_to_kaddr(highstart_pfn + + node_remap_offset[nid] + node_remap_size[nid])); +} +#else +void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} + +static unsigned long calculate_numa_remap_pages(void) +{ + return 0; +} + +static void init_remap_allocator(int nid) +{ +} + +void __init remap_numa_kva(void) +{ +} +#endif /* CONFIG_DISCONTIGMEM */ + extern void setup_bootmem_allocator(void); unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; + unsigned long wasted_pages; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -288,11 +336,18 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD /* Numa kva area is below the initrd */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) - kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image) + if (initrd_start) + kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET) - kva_pages; #endif - kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1); + + /* + * We waste pages past at the end of the KVA for no good reason other + * than how it is located. This is bad. + */ + wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1); + kva_start_pfn -= wasted_pages; + kva_pages += wasted_pages; system_max_low_pfn = max_low_pfn = find_max_low_pfn(); printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", @@ -318,19 +373,9 @@ unsigned long __init setup_memory(void) printk("Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) { - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - /* Init the node remap allocator */ - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); + init_remap_allocator(nid); allocate_pgdat(nid); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) pfn_to_kaddr(highstart_pfn - + node_remap_offset[nid] + node_remap_size[nid])); } printk("High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); @@ -345,7 +390,8 @@ unsigned long __init setup_memory(void) void __init numa_kva_reserve(void) { - reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages)); + if (kva_pages) + reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages)); } void __init zone_sizes_init(void) @@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif + +#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT +/* + * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA + * + * These stub functions are needed to compile 32-bit NUMA when SRAT is + * not set. There are functions in srat_64.c for parsing this table + * and it may be possible to make them common functions. + */ +void acpi_numa_slit_init (struct acpi_table_slit *slit) +{ + printk(KERN_INFO "ACPI: No support for parsing SLIT table\n"); +} + +void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa) +{ +} + +void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma) +{ +} + +void acpi_numa_arch_fixup(void) +{ +} +#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c new file mode 100644 index 00000000000..7e8db53528a --- /dev/null +++ b/arch/x86/mm/extable.c @@ -0,0 +1,62 @@ +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> + + +int fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *fixup; + +#ifdef CONFIG_PNPBIOS + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; + printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); + __asm__ volatile( + "movl %0, %%esp\n\t" + "jmp *%1\n\t" + : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); + panic("do_trap: can't hit this"); + } +#endif + + fixup = search_exception_tables(regs->ip); + if (fixup) { + regs->ip = fixup->fixup; + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * Need to defined our own search_extable on X86_64 to work around + * a B stepping K8 bug. + */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + /* B stepping K8 bug */ + if ((value >> 32) == 0) + value |= 0xffffffffUL << 32; + + while (first <= last) { + const struct exception_table_entry *mid; + long diff; + + mid = (last - first) / 2 + first; + diff = mid->insn - value; + if (diff == 0) + return mid; + else if (diff < 0) + first = mid+1; + else + last = mid-1; + } + return NULL; +} +#endif diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c deleted file mode 100644 index 0ce4f22a263..00000000000 --- a/arch/x86/mm/extable_32.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * linux/arch/i386/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <asm/uaccess.h> - -int fixup_exception(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; - -#ifdef CONFIG_PNPBIOS - if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) - { - extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; - extern u32 pnp_bios_is_utter_crap; - pnp_bios_is_utter_crap = 1; - printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); - __asm__ volatile( - "movl %0, %%esp\n\t" - "jmp *%1\n\t" - : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); - panic("do_trap: can't hit this"); - } -#endif - - fixup = search_exception_tables(regs->eip); - if (fixup) { - regs->eip = fixup->fixup; - return 1; - } - - return 0; -} diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c deleted file mode 100644 index 79ac6e7100a..00000000000 --- a/arch/x86/mm/extable_64.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * linux/arch/x86_64/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/init.h> -#include <asm/uaccess.h> - -/* Simple binary search */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* Work around a B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c new file mode 100644 index 00000000000..e28cc5277b1 --- /dev/null +++ b/arch/x86/mm/fault.c @@ -0,0 +1,986 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/compiler.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> + +#include <asm/system.h> +#include <asm/desc.h> +#include <asm/segment.h> +#include <asm/pgalloc.h> +#include <asm/smp.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm-generic/sections.h> + +/* + * Page fault error code bits + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch + */ +#define PF_PROT (1<<0) +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +static inline int notify_page_fault(struct pt_regs *regs) +{ +#ifdef CONFIG_KPROBES + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ +#ifdef CONFIG_X86_32 + if (!user_mode_vm(regs)) { +#else + if (!user_mode(regs)) { +#endif + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +#else + return 0; +#endif +} + +/* + * X86_32 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * X86_64 + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner + */ +static int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr; + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr; + +#ifdef CONFIG_X86_32 + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; +#endif + + /* If it was a exec fault on NX page, ignore */ + if (error_code & PF_INSTR) + return 0; + + instr = (unsigned char *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (probe_kernel_address(instr, opcode)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + scan_more = ((instr_lo & 7) == 0x6); + break; +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + + if (probe_kernel_address(instr, opcode)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + +#ifdef CONFIG_X86_64 +static int bad_address(void *p) +{ + unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); +} +#endif + +void dump_pagetable(unsigned long address) +{ +#ifdef CONFIG_X86_32 + __typeof__(pte_val(__pte(0))) page; + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_CONT "*pde = %016Lx ", page); + page &= ~_PAGE_NX; + } +#else + printk("*pde = %08lx ", page); +#endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT) + && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); + } + + printk("\n"); +#else /* CONFIG_X86_64 */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = (pgd_t *)read_cr3(); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud)) goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +#endif +} + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; +} +#endif + +#ifdef CONFIG_X86_64 +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. + Does nothing for X86_32 + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + static int warned; + if (address != regs->ip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal + * addresses >4GB. We catch this in the page fault handler because these + * addresses are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return 1; +#endif + return 0; +} + +void do_invalid_op(struct pt_regs *, unsigned long); + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_X86_32 + if (!oops_may_print()) + return; +#endif + +#ifdef CONFIG_X86_PAE + if (error_code & PF_INSTR) { + int level; + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current->uid); + } +#endif + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); +#ifdef CONFIG_X86_32 + printk(KERN_CONT " at %08lx\n", address); +#else + printk(KERN_CONT " at %016lx\n", address); +#endif + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); + dump_pagetable(address); +} + +#ifdef CONFIG_X86_64 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + unsigned long flags = oops_begin(); + struct task_struct *tsk; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) + regs = NULL; + oops_end(flags, regs, SIGKILL); +} +#endif + +/* + * Handle a spurious fault caused by a stale TLB entry. This allows + * us to lazily refresh the TLB when increasing the permissions of a + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very + * expensive since that implies doing a full cross-processor TLB + * flush, even if no stale TLB entries exist on other processors. + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +static int spurious_fault(unsigned long address, + unsigned long error_code) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & (PF_USER | PF_RSVD)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * X86_32 + * Handle a fault on the vmalloc or module mapping area + * + * X86_64 + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static int vmalloc_fault(unsigned long address) +{ +#ifdef CONFIG_X86_32 + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +#else + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + pgd = pgd_offset(current->mm ?: &init_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +#endif +} + +int show_unhandled_signals = 1; + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long address; + int write, si_code; + int fault; +#ifdef CONFIG_X86_64 + unsigned long flags; +#endif + + /* + * We can fault from pretty much anywhere, with unknown IRQ state. + */ + trace_hardirqs_fixup(); + + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + + /* get the address */ + address = read_cr2(); + + si_code = SEGV_MAPERR; + + if (notify_page_fault(regs)) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ +#ifdef CONFIG_X86_32 + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + vmalloc_fault(address) >= 0) + return; + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + /* It's safe to allow irq's after cr2 has been saved and the vmalloc + fault has been handled. */ + if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; +#else /* CONFIG_X86_64 */ + if (unlikely(address >= TASK_SIZE64)) { + /* + * Don't check for the module range here: its PML4 + * is always initialized because it's shared with the main + * kernel text. Only vmalloc may need PML4 syncups. + */ + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + ((address >= VMALLOC_START && address < VMALLOC_END))) { + if (vmalloc_fault(address) >= 0) + return; + } + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + if (likely(regs->flags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + /* + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. + */ + if (user_mode_vm(regs)) + error_code |= PF_USER; +again: +#endif + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->ip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535,$31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + si_code = SEGV_ACCERR; + write = 0; + switch (error_code & (PF_PROT|PF_WRITE)) { + default: /* 3: write, present */ + /* fall through */ + case PF_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case PF_PROT: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + } + +#ifdef CONFIG_X86_32 +survive: +#endif + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + +#ifdef CONFIG_X86_32 + /* + * Did it hit the DOS screen memory VA from vm86 mode? + */ + if (v8086_mode(regs)) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } +#endif + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( +#ifdef CONFIG_X86_32 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", +#else + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", +#endif + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, regs->ip, + regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + if (__die("Oops", regs, error_code)) + regs = NULL; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, SIGKILL); +#endif + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (is_global_init(tsk)) { + yield(); +#ifdef CONFIG_X86_32 + down_read(&mm->mmap_sem); + goto survive; +#else + goto again; +#endif + } + + printk("VM: killing process %s\n", tsk->comm); + if (error_code & PF_USER) + do_group_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + goto no_context; +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +void vmalloc_sync_all(void) +{ +#ifdef CONFIG_X86_32 + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = TASK_SIZE; + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); + for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + if (!page) + set_bit(pgd_index(address), insync); + } + if (address == start && test_bit(pgd_index(address), insync)) + start = address + PGDIR_SIZE; + } +#else /* CONFIG_X86_64 */ + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock(&pgd_lock); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. */ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +#endif +} diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c deleted file mode 100644 index a2273d44aa2..00000000000 --- a/arch/x86/mm/fault_32.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/highmem.h> -#include <linux/bootmem.h> /* for max_low_pfn */ -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/desc.h> -#include <asm/segment.h> - -extern void die(const char *,struct pt_regs *,long); - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* - * Return EIP plus the CS segment base. The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. - * - * The segment is checked, because it might have been changed by another - * task between the original faulting instruction and here. - * - * If CS is no longer a valid code segment, or if EIP is beyond the - * limit, or if it is a kernel address when CS is not a kernel segment, - * then the returned value will be greater than *eip_limit. - * - * This is slow, but is very rarely executed. - */ -static inline unsigned long get_segment_eip(struct pt_regs *regs, - unsigned long *eip_limit) -{ - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; - u32 seg_ar, seg_limit, base, *desc; - - /* Unlikely, but must come before segment checks. */ - if (unlikely(regs->eflags & VM_MASK)) { - base = seg << 4; - *eip_limit = base + 0xffff; - return base + (eip & 0xffff); - } - - /* The standard kernel/user address space limit. */ - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; - - /* By far the most common cases. */ - if (likely(SEGMENT_IS_FLAT_CODE(seg))) - return eip; - - /* Check the segment exists, is within the current LDT/GDT size, - that kernel/user (ring 0..3) has the appropriate privilege, - that it's a code segment, and get the limit. */ - __asm__ ("larl %3,%0; lsll %3,%1" - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { - *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ - } - - /* Get the GDT/LDT descriptor base. - When you look for races in this code remember that - LDT and other horrors are only used in user space. */ - if (seg & (1<<2)) { - /* Must lock the LDT while reading it. */ - mutex_lock(¤t->mm->context.lock); - desc = current->mm->context.ldt; - desc = (void *)desc + (seg & ~7); - } else { - /* Must disable preemption while reading the GDT. */ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); - desc = (void *)desc + (seg & ~7); - } - - /* Decode the code segment base from the descriptor */ - base = get_desc_base((unsigned long *)desc); - - if (seg & (1<<2)) { - mutex_unlock(¤t->mm->context.lock); - } else - put_cpu(); - - /* Adjust EIP and segment limit, and clamp at the kernel limit. - It's legitimate for segments to wrap at 0xffffffff. */ - seg_limit += base; - if (seg_limit < *eip_limit && seg_limit >= base) - *eip_limit = seg_limit; - return eip + base; -} - -/* - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - */ -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) -{ - unsigned long limit; - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); - int scan_more = 1; - int prefetch = 0; - int i; - - for (i = 0; scan_more && i < 15; i++) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 6)) { - /* Catch an obscure case of prefetch inside an NX page. */ - if (nx_enabled && (error_code & 16)) - return 0; - return __is_prefetch(regs, addr); - } - return 0; -} - -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - force_sig_info(si_signo, &info, tsk); -} - -fastcall void do_invalid_op(struct pt_regs *, unsigned long); - -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; -} - -/* - * Handle a fault on the vmalloc or module mapping area - * - * This assumes no large pages in there. - */ -static inline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch - */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - int write, si_code; - int fault; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - /* get the address */ - address = read_cr2(); - - tsk = current; - - si_code = SEGV_MAPERR; - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) - return; - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); - - mm = tsk->mm; - - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault.. - */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; - - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * Accessing the stack below %esp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes - * 32 pointers and then decrements %esp by 65535.) - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk("%s%s[%d]: segfault at %08lx eip %08lx " - "esp %08lx error %lx\n", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->eip, - regs->esp, error_code); - } - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - -#ifdef CONFIG_X86_F00F_BUG - /* - * Pentium F0 0F C7 C8 bug workaround. - */ - if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return; - } - } -#endif - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - */ - if (is_prefetch(regs, address, error_code)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - bust_spinlocks(1); - - if (oops_may_print()) { - __typeof__(pte_val(__pte(0))) page; - -#ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); - - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); - } -#endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "BUG: unable to handle kernel NULL " - "pointer dereference"); - else - printk(KERN_ALERT "BUG: unable to handle kernel paging" - " request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT "printing eip: %08lx ", regs->eip); - - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; -#ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); -#endif - - /* - * We must not directly access the pte in the highpte - * case if the page table is located in highmem. - * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. - */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } - - printk("\n"); - } - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); -} - -void vmalloc_sync_all(void) -{ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = TASK_SIZE; - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { - BUG_ON(page != pgd_list); - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(pgd_index(address), insync); - } - if (address == start && test_bit(pgd_index(address), insync)) - start = address + PGDIR_SIZE; - } -} diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c deleted file mode 100644 index 0e26230669c..00000000000 --- a/arch/x86/mm/fault_64.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * linux/arch/x86-64/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/compiler.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm-generic/sections.h> - -/* Page fault error code bits */ -#define PF_PROT (1<<0) /* or no page found */ -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr; - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr; - - /* If it was a exec fault ignore */ - if (error_code & PF_INSTR) - return 0; - - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); - max_instr = instr + 15; - - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; - - pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - unsigned long flags = oops_begin(); - struct task_struct *tsk; - - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Bad pagetable", regs, error_code); - oops_end(flags); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - pgd = pgd_offset(current->mm ?: &init_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - const struct exception_table_entry *fixup; - int write, fault; - unsigned long flags; - siginfo_t info; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - tsk = current; - mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - /* get the address */ - address = read_cr2(); - - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; - - again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (likely(vma->vm_start <= address)) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* Allow userspace just enough access below the stack pointer - * to let the 'enter' instruction work. - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - if (is_prefetch(regs, address, error_code)) - return; - - /* Work around K8 erratum #100 K8 in compat mode - occasionally jumps to illegal addresses >4GB. We - catch this here in the page fault handler because - these addresses are not reachable. Just detect this - case and return. Any code segment in LDT is - compatibility mode. */ - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - flags = oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - /* Note that races in the updates of insync and start aren't - problematic: - insync can only get set bits added, and updates to start are only - improving performance (without affecting correctness if undone). */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock(&pgd_lock); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock(&pgd_lock); - set_bit(pgd_index(address), insync); - } - if (address == start) - start = address + PGDIR_SIZE; - } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); -} diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 1c3bf95f735..3d936f23270 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -18,6 +18,49 @@ void kunmap(struct page *page) kunmap_high(page); } +static void debug_kmap_atomic_prot(enum km_type type) +{ +#ifdef CONFIG_DEBUG_HIGHMEM + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +#endif +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + + debug_kmap_atomic_prot(type); + pagefault_disable(); if (!PageHighMem(page)) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 6c06d9c0488..4fbafb4bc2f 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -15,6 +15,7 @@ #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, @@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_lock(&mm->page_table_lock); if (pud_none(*pud)) - pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); + pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3c76d194fd2..da524fb2242 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -27,7 +27,6 @@ #include <linux/bootmem.h> #include <linux/slab.h> #include <linux/proc_fs.h> -#include <linux/efi.h> #include <linux/memory_hotplug.h> #include <linux/initrd.h> #include <linux/cpumask.h> @@ -40,8 +39,10 @@ #include <asm/fixmap.h> #include <asm/e820.h> #include <asm/apic.h> +#include <asm/bugs.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/paravirt.h> @@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; -static int noinline do_test_wp_bit(void); +static noinline int do_test_wp_bit(void); /* * Creates a middle page table and puts a pointer to it in the @@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) { pud_t *pud; pmd_t *pmd_table; - + #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); + BUG_ON(pmd_table != pmd_offset(pud, 0)); } #endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); + return pmd_table; } /* * Create a page table and place a pointer to it in a middle page - * directory entry. + * directory entry: */ static pte_t * __init one_page_table_init(pmd_t *pmd) { @@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif - if (!page_table) + if (!page_table) { page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + } paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) } /* - * This function initializes a certain range of kernel virtual memory + * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without * checking the pgd every time. */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); pmd = pmd + pmd_index(vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { one_page_table_init(pmd); vaddr += PMD_SIZE; @@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr) } /* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) { + int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pmd = one_md_table_init(pgd); if (pfn >= max_low_pfn) continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - /* Map with big pages if possible, otherwise create normal page tables. */ + for (pmd_idx = 0; + pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ if (cpu_has_pse) { - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } + continue; + } + pte = one_page_table_init(pmd); + + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(pfn, prot)); } } } @@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } -int page_is_ram(unsigned long pagenr) -{ - int i; - unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; - } - - for (i = 0; i < e820.nr_map; i++) { - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. - */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} static void __init kmap_init(void) { unsigned long kmap_vstart; - /* cache the first kmap pte */ + /* + * Cache the first kmap pte: + */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); @@ -259,11 +241,11 @@ static void __init kmap_init(void) static void __init permanent_kmaps_init(pgd_t *pgd_base) { + unsigned long vaddr; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr; vaddr = PKMAP_BASE; page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); @@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + pkmap_page_table = pte; } static void __meminit free_new_highpage(struct page *page) @@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) +static int __meminit +add_one_highpage_hotplug(struct page *page, unsigned long pfn) { free_new_highpage(page); totalram_pages++; @@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p max_mapnr = max(pfn, max_mapnr); #endif num_physpages++; + return 0; } @@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p * Not currently handling the NUMA case. * Assuming single node and all memory that * has been added dynamically that would be - * onlined here is in HIGHMEM + * onlined here is in HIGHMEM. */ void __meminit online_page(struct page *page) { @@ -314,13 +298,11 @@ void __meminit online_page(struct page *page) add_one_highpage_hotplug(page, page_to_pfn(page)); } - -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else +#ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { /* * Holes under sparsemem might not have no mem_map[]: @@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro) } totalram_pages += totalhigh_pages; } -#endif /* CONFIG_FLATMEM */ +#endif /* !CONFIG_NUMA */ #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) +# define kmap_init() do { } while (0) +# define permanent_kmaps_init(pgd_base) do { } while (0) +# define set_highmem_pages_init(bad_ppro) do { } while (0) #endif /* CONFIG_HIGHMEM */ -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +pteval_t __PAGE_KERNEL = _PAGE_KERNEL; EXPORT_SYMBOL(__PAGE_KERNEL); -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; void __init native_pagetable_setup_start(pgd_t *base) { @@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base) memset(&base[USER_PTRS_PER_PGD], 0, KERNEL_PGD_PTRS * sizeof(pgd_t)); #else - paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); #endif } @@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init pagetable_init (void) +static void __init pagetable_init(void) { - unsigned long vaddr, end; pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; paravirt_pagetable_setup_start(pgd_base); @@ -435,9 +412,11 @@ static void __init pagetable_init (void) * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ + early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); permanent_kmaps_init(pgd_base); @@ -450,7 +429,7 @@ static void __init pagetable_init (void) * driver might have split up a kernel 4MB mapping. */ char __nosavedata swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); + __attribute__ ((aligned(PAGE_SIZE))); static inline void save_pg_dir(void) { @@ -462,7 +441,7 @@ static inline void save_pg_dir(void) } #endif -void zap_low_mappings (void) +void zap_low_mappings(void) { int i; @@ -474,22 +453,24 @@ void zap_low_mappings (void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) + for (i = 0; i < USER_PTRS_PER_PGD; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); #endif + } flush_tlb_all(); } -int nx_enabled = 0; +int nx_enabled; + +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE -static int disable_nx __initdata = 0; -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; -EXPORT_SYMBOL_GPL(__supported_pte_mask); +static int disable_nx __initdata; /* * noexec = on|off @@ -506,11 +487,14 @@ static int __init noexec_setup(char *str) __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } - } else if (!strcmp(str,"off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else - return -EINVAL; + } else { + if (!strcmp(str, "off")) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } else { + return -EINVAL; + } + } return 0; } @@ -522,6 +506,7 @@ static void __init set_nx(void) if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { rdmsr(MSR_EFER, l, h); l |= EFER_NX; @@ -531,35 +516,6 @@ static void __init set_nx(void) } } } - -/* - * Enables/disables executability of a given kernel page and - * returns the previous setting. - */ -int __init set_kernel_exec(unsigned long vaddr, int enable) -{ - pte_t *pte; - int ret = 1; - - if (!nx_enabled) - goto out; - - pte = lookup_address(vaddr); - BUG_ON(!pte); - - if (!pte_exec_kernel(*pte)) - ret = 0; - - if (enable) - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - else - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); - pte_update_defer(&init_mm, vaddr, pte); - __flush_tlb_all(); -out: - return ret; -} - #endif /* @@ -574,9 +530,8 @@ void __init paging_init(void) #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) - printk("NX (Execute Disable) protection: active\n"); + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); #endif - pagetable_init(); load_cr3(swapper_pg_dir); @@ -600,10 +555,10 @@ void __init paging_init(void) * used to involve black magic jumps to work around some nasty CPU bugs, * but fortunately the switch to using exceptions got rid of all that. */ - static void __init test_wp_bit(void) { - printk("Checking if this processor honours the WP bit even in supervisor mode... "); + printk(KERN_INFO + "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); @@ -611,47 +566,46 @@ static void __init test_wp_bit(void) clear_fixmap(FIX_WP_TEST); if (!boot_cpu_data.wp_works_ok) { - printk("No.\n"); + printk(KERN_CONT "No.\n"); #ifdef CONFIG_X86_WP_WORKS_OK - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); + panic( + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); #endif } else { - printk("Ok.\n"); + printk(KERN_CONT "Ok.\n"); } } -static struct kcore_list kcore_mem, kcore_vmalloc; +static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { - extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; - int bad_ppro; + int tmp, bad_ppro; #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR + "fixmap and kmap areas overlap - this will crash\n"); printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, + FIXADDR_START); BUG(); } #endif - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* - * Only count reserved RAM pages + * Only count reserved RAM pages: */ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; @@ -662,11 +616,12 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, @@ -677,45 +632,46 @@ void __init mem_init(void) ); #if 1 /* double-sanity-check paranoia */ - printk("virtual kernel memory layout:\n" - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, #ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, #endif - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, - (unsigned long)__va(0), (unsigned long)high_memory, - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, - (unsigned long)&__init_begin, (unsigned long)&__init_end, - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, - (unsigned long)&_etext, (unsigned long)&_edata, - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, - (unsigned long)&_text, (unsigned long)&_etext, - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); #ifdef CONFIG_HIGHMEM - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); - BUG_ON(VMALLOC_END > PKMAP_BASE); + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); - BUG_ON((unsigned long)high_memory > VMALLOC_START); + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ #ifdef CONFIG_X86_PAE @@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(zone, start_pfn, nr_pages); } - #endif -struct kmem_cache *pmd_cache; - -void __init pgtable_cache_init(void) -{ - if (PTRS_PER_PMD > 1) - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); -} - /* * This function cannot be __init, since exceptions don't work in that * section. Put this after the callers, so that it cannot be inlined. */ -static int noinline do_test_wp_bit(void) +static noinline int do_test_wp_bit(void) { char tmp_reg; int flag; __asm__ __volatile__( - " movb %0,%1 \n" - "1: movb %1,%0 \n" - " xorl %2,%2 \n" + " movb %0, %1 \n" + "1: movb %1, %0 \n" + " xorl %2, %2 \n" "2: \n" - ".section __ex_table,\"a\"\n" + ".section __ex_table, \"a\"\n" " .align 4 \n" - " .long 1b,2b \n" + " .long 1b, 2b \n" ".previous \n" :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) :"2" (1) :"memory"); - + return flag; } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -801,32 +746,58 @@ void mark_rodata_ro(void) if (num_possible_cpus() <= 1) #endif { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif } #endif start += size; size = (unsigned long)__end_rodata - start; - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk("Write protecting the kernel read-only data: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + rodata_test(); - /* - * change_page_attr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif } #endif void free_init_pages(char *what, unsigned long begin, unsigned long end) { +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else unsigned long addr; + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); @@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) totalram_pages++; } printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); +#endif } void free_initmem(void) @@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0f9c8c89065..cc50a13ce8d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -43,12 +43,10 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> -#ifndef Dprintk -#define Dprintk(x...) -#endif - -const struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; @@ -65,22 +63,26 @@ void show_mem(void) { long i, total = 0, reserved = 0; long shared = 0, cached = 0; - pg_data_t *pgdat; struct page *page; + pg_data_t *pgdat; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", + nr_swap_pages << (PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* this loop can take a while with 256 GB and 4k pages - so update the NMI watchdog */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* + * This loop can take a while with 256 GB and + * 4k pages so defer the NMI watchdog: + */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) touch_nmi_watchdog(); - } + if (!pfn_valid(pgdat->node_start_pfn + i)) continue; + page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -89,51 +91,58 @@ void show_mem(void) cached++; else if (page_count(page)) shared += page_count(page) - 1; - } + } } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n",reserved); - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n", reserved); + printk(KERN_INFO "%lu pages shared\n", shared); + printk(KERN_INFO "%lu pages swap cached\n", cached); } int after_bootmem; static __init void *spp_getpage(void) -{ +{ void *ptr; + if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - Dprintk("spp_getpage %p\n", ptr); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + return ptr; -} +} -static __init void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot) +static __init void +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte, new_pte; - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } pud = pud_offset(pgd, vaddr); if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); return; } } @@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr, pte = (pte_t *) spp_getpage(); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; } } @@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr, } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); + printk(KERN_ERR "Invalid __set_fixmap\n"); return; } set_pte_phys(address, phys, prot); } -unsigned long __meminitdata table_start, table_end; +static unsigned long __initdata table_start; +static unsigned long __meminitdata table_end; static __meminit void *alloc_low_page(unsigned long *phys) -{ +{ unsigned long pfn = table_end++; void *adr; if (after_bootmem) { adr = (void *)get_zeroed_page(GFP_ATOMIC); *phys = __pa(adr); + return adr; } - if (pfn >= end_pfn) - panic("alloc_low_page: ran out of memory"); + if (pfn >= end_pfn) + panic("alloc_low_page: ran out of memory"); adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); @@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys) } static __meminit void unmap_low_page(void *adr) -{ - +{ if (after_bootmem) return; early_iounmap(adr, PAGE_SIZE); -} +} /* Must run before zap_low_mappings */ __meminit void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long vaddr; pmd_t *pmd, *last_pmd; + unsigned long vaddr; int i, pmds; pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; vaddr = __START_KERNEL_map; pmd = level2_kernel_pgt; last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { for (i = 0; i < pmds; i++) { if (pmd_present(pmd[i])) - goto next; + goto continue_outer_loop; } vaddr += addr & ~PMD_MASK; addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + __flush_tlb_all(); + return (void *)vaddr; - next: +continue_outer_loop: ; } - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } -/* To avoid virtual aliases later */ +/* + * To avoid virtual aliases later: + */ __meminit void early_iounmap(void *addr, unsigned long size) { unsigned long vaddr; @@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size) vaddr = (unsigned long)addr; pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) pmd_clear(pmd + i); - __flush_tlb(); + + __flush_tlb_all(); } static void __meminit @@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { - if (!after_bootmem) + if (!after_bootmem) { for (; i < PTRS_PER_PMD; i++, pmd++) set_pmd(pmd, __pmd(0)); + } break; } if (pmd_val(*pmd)) continue; - entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; + entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; entry &= __supported_pte_mask; set_pmd(pmd, __pmd(entry)); } @@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud,0); + pmd_t *pmd = pmd_offset(pud, 0); spin_lock(&init_mm.page_table_lock); phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) -{ +static void __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +{ int i = pud_index(addr); - - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne if (addr >= end) break; - if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { - set_pud(pud, __pud(0)); + if (!after_bootmem && + !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { + set_pud(pud, __pud(0)); continue; - } + } if (pud_val(*pud)) { phys_pmd_update(pud, addr, end); @@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne } pmd = alloc_low_page(&pmd_phys); + spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); + unmap_low_page(pmd); } - __flush_tlb(); -} + __flush_tlb_all(); +} static void __init find_early_table_space(unsigned long end) { @@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end) tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - /* RED-PEN putting page tables only on node 0 could - cause a hotspot and fill up ZONE_DMA. The page tables - need roughly 0.5KB per GB. */ - start = 0x8000; - table_start = find_e820_area(start, end, tables); + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ + start = 0x8000; + table_start = find_e820_area(start, end, tables); if (table_start == -1UL) panic("Cannot find space for the kernel page tables"); + /* + * When you have a lot of RAM like 256GB, early_table will not fit + * into 0x8000 range, find_e820_area() will find area after kernel + * bss but the table_start is not page aligned, so need to round it + * up to avoid overlap with bss: + */ + table_start = round_up(table_start, PAGE_SIZE); table_start >>= PAGE_SHIFT; table_end = table_start; @@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end) (table_start << PAGE_SHIFT) + tables); } -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. */ +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) -{ - unsigned long next; +{ + unsigned long next; - Dprintk("init_memory_mapping\n"); + pr_debug("init_memory_mapping\n"); - /* + /* * Find space for the kernel direct mapping tables. - * Later we should allocate these tables in the local node of the memory - * mapped. Unfortunately this is done currently before the nodes are - * discovered. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. */ if (!after_bootmem) find_early_table_space(end); @@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); + unsigned long pud_phys; pud_t *pud; if (after_bootmem) @@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; - if (next > end) - next = end; + if (next > end) + next = end; phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(pud); - } + } if (!after_bootmem) mmu_cr4_features = read_cr4(); __flush_tlb_all(); + + reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); } #ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -402,39 +439,48 @@ void __init paging_init(void) } #endif -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) +/* + * Unmap a kernel mapping if it exists. This is useful to avoid + * prefetches from the CPU leading to inconsistent cache lines. + * address and size must be aligned to 2MB boundaries. + * Does nothing when the mapping doesn't exist. + */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) { unsigned long end = address + size; BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { + BUG_ON(size & ~LARGE_PAGE_MASK); + + for (; address < end; address += LARGE_PAGE_SIZE) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; + if (pgd_none(*pgd)) continue; + pud = pud_offset(pgd, address); if (pud_none(*pud)) - continue; + continue; + pmd = pmd_offset(pud, address); if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); + continue; + + if (!(pmd_val(*pmd) & _PAGE_PSE)) { + /* + * Could handle this, but it should not happen + * currently: + */ + printk(KERN_ERR "clear_kernel_mapping: " + "mapping has been split. will leak memory\n"); + pmd_ERROR(*pmd); } - set_pmd(pmd, __pmd(0)); + set_pmd(pmd, __pmd(0)); } __flush_tlb_all(); -} +} /* * Memory hotplug specific functions @@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size) unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, (start + size -1)); + init_memory_mapping(start, start + size-1); ret = __add_pages(zone, start_pfn, nr_pages); - if (ret) - goto error; + WARN_ON(1); return ret; -error: - printk("%s: Problem encountered in __add_pages!\n", __func__); - return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. - */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) -{ - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; -} -#endif - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, + kcore_modules, kcore_vsyscall; void __init mem_init(void) { @@ -521,8 +535,15 @@ void __init mem_init(void) pci_iommu_alloc(); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); + /* clear_bss() already clear the empty_zero_page */ + + /* temporary debugging - double check it's true: */ + { + int i; + + for (i = 0; i < 1024; i++) + WARN_ON_ONCE(empty_zero_page[i]); + } reservedpages = 0; @@ -534,7 +555,6 @@ void __init mem_init(void) #endif reservedpages = end_pfn - totalram_pages - absent_pages_in_range(0, end_pfn); - after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -542,15 +562,16 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); kclist_add(&kcore_kernel, &_stext, _end - _stext); kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START); - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " + "%ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), end_pfn << (PAGE_SHIFT-10), codesize >> 10, @@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), POISON_FREE_INITMEM, PAGE_SIZE); - if (addr >= __START_KERNEL_map) - change_page_attr_addr(addr, 1, __pgprot(0)); free_page(addr); totalram_pages++; } - if (addr > __START_KERNEL_map) - global_flush_tlb(); +#endif } void free_initmem(void) @@ -589,6 +618,8 @@ void free_initmem(void) } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -603,25 +634,27 @@ void mark_rodata_ro(void) #ifdef CONFIG_KPROBES start = (unsigned long)__start_rodata; #endif - + end = (unsigned long)__end_rodata; start = (start + PAGE_SIZE - 1) & PAGE_MASK; end &= PAGE_MASK; if (end <= start) return; - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); - /* - * change_page_attr_addr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif } #endif @@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); #endif unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { - /* This can happen with kdump kernels when accessing firmware - tables. */ + /* + * This can happen with kdump kernels when accessing + * firmware tables: + */ if (pfn < end_pfn_map) return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); return; @@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); + reserve_bootmem_node(NODE_DATA(nid), phys, len); +#else + reserve_bootmem(phys, len); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; @@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) } } -int kern_addr_valid(unsigned long addr) -{ +int kern_addr_valid(unsigned long addr) +{ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; if (above != 0 && above != -1UL) - return 0; - + return 0; + pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) return 0; pud = pud_offset(pgd, addr); if (pud_none(*pud)) - return 0; + return 0; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; + if (pmd_large(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) return 0; + return pfn_valid(pte_pfn(*pte)); } -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only - covers the 64bit vsyscall page now. 32bit has a real VMA now and does - not need special handling anymore. */ - +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC }; struct vm_area_struct *get_gate_vma(struct task_struct *tsk) @@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) return 0; + return (addr >= vma->vm_start) && (addr < vma->vm_end); } -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. +/* + * Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives: */ int in_gate_area_no_task(unsigned long addr) { @@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma) /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ -int __meminit vmemmap_populate(struct page *start_page, - unsigned long size, int node) +int __meminit +vmemmap_populate(struct page *start_page, unsigned long size, int node) { unsigned long addr = (unsigned long)start_page; unsigned long end = (unsigned long)(start_page + size); @@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page, pgd = vmemmap_pgd_populate(addr, node); if (!pgd) return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); if (!pud) return -ENOMEM; @@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page, pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { pte_t entry; - void *p = vmemmap_alloc_block(PMD_SIZE, node); + void *p; + + p = vmemmap_alloc_block(PMD_SIZE, node); if (!p) return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); - mk_pte_huge(entry); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", addr, addr + PMD_SIZE - 1, p, node); - } else + } else { vmemmap_verify((pte_t *)pmd, node, addr, next); + } } - return 0; } #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c new file mode 100644 index 00000000000..ed795721ca8 --- /dev/null +++ b/arch/x86/mm/ioremap.c @@ -0,0 +1,501 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> + +enum ioremap_mode { + IOR_MODE_UNCACHED, + IOR_MODE_CACHED, +}; + +#ifdef CONFIG_X86_64 + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) + return x - __START_KERNEL_map + phys_base; + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + +#endif + +int page_is_ram(unsigned long pagenr) +{ + unsigned long addr, end; + int i; + + for (i = 0; i < e820.nr_map; i++) { + /* + * Not usable memory: + */ + if (e820.map[i].type != E820_RAM) + continue; + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; + + /* + * Sanity check: Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area, which is the + * PCI BIOS area. + */ + if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) && + end < (BIOS_END >> PAGE_SHIFT)) + continue; + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int ioremap_change_attr(unsigned long paddr, unsigned long size, + enum ioremap_mode mode) +{ + unsigned long vaddr = (unsigned long)__va(paddr); + unsigned long nrpages = size >> PAGE_SHIFT; + int err, level; + + /* No change for pages after the last mapping */ + if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT)) + return 0; + + /* + * If there is no identity map for this address, + * change_page_attr_addr is unnecessary + */ + if (!lookup_address(vaddr, &level)) + return 0; + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + err = set_memory_uc(vaddr, nrpages); + break; + case IOR_MODE_CACHED: + err = set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, + enum ioremap_mode mode) +{ + void __iomem *addr; + struct vm_struct *area; + unsigned long offset, last_addr; + pgprot_t prot; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (__force void __iomem *)phys_to_virt(phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && + (offset << PAGE_SHIFT) < last_addr; offset++) { + if (page_is_ram(offset)) + return NULL; + } + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + prot = PAGE_KERNEL_NOCACHE; + break; + case IOR_MODE_CACHED: + prot = PAGE_KERNEL; + break; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, + phys_addr, prot)) { + remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + return NULL; + } + + if (ioremap_change_attr(phys_addr, size, mode) < 0) { + vunmap(addr); + return NULL; + } + + return (void __iomem *) (offset + (char __iomem *)addr); +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); +} +EXPORT_SYMBOL(ioremap_nocache); + +void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_CACHED); +} +EXPORT_SYMBOL(ioremap_cache); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if (addr >= phys_to_virt(ISA_START_ADDRESS) && + addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +#ifdef CONFIG_X86_32 + +int __initdata early_ioremap_debug; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static __initdata int after_paging_init; +static __initdata unsigned long bm_pte[1024] + __attribute__((aligned(PAGE_SIZE))); + +static inline unsigned long * __init early_ioremap_pgd(unsigned long addr) +{ + return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023); +} + +static inline unsigned long * __init early_ioremap_pte(unsigned long addr) +{ + return bm_pte + ((addr >> PAGE_SHIFT) & 1023); +} + +void __init early_ioremap_init(void) +{ + unsigned long *pgd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + + pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); + *pgd = __pa(bm_pte) | _PAGE_TABLE; + memset(bm_pte, 0, sizeof(bm_pte)); + /* + * The boot-ioremap range spans multiple pgds, for which + * we are not prepared: + */ + if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pgd %p != %p\n", + pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init early_ioremap_clear(void) +{ + unsigned long *pgd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_clear()\n"); + + pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); + *pgd = 0; + paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT); + __flush_tlb_all(); +} + +void __init early_ioremap_reset(void) +{ + enum fixed_addresses idx; + unsigned long *pte, phys, addr; + + after_paging_init = 1; + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { + addr = fix_to_virt(idx); + pte = early_ioremap_pte(addr); + if (!*pte & _PAGE_PRESENT) { + phys = *pte & PAGE_MASK; + set_fixmap(idx, phys); + } + } +} + +static void __init __early_set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags) +{ + unsigned long *pte, addr = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + if (pgprot_val(flags)) + *pte = (phys & PAGE_MASK) | pgprot_val(flags); + else + *pte = 0; + __flush_tlb_one(addr); +} + +static inline void __init early_set_fixmap(enum fixed_addresses idx, + unsigned long phys) +{ + if (after_paging_init) + set_fixmap(idx, phys); + else + __early_set_fixmap(idx, phys, PAGE_KERNEL); +} + +static inline void __init early_clear_fixmap(enum fixed_addresses idx) +{ + if (after_paging_init) + clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, __pgprot(0)); +} + + +int __initdata early_ioremap_nested; + +static int __init check_early_ioremap_leak(void) +{ + if (!early_ioremap_nested) + return 0; + + printk(KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n", + early_ioremap_nested); + printk(KERN_WARNING + "please boot with early_ioremap_debug and report the dmesg.\n"); + WARN_ON(1); + + return 1; +} +late_initcall(check_early_ioremap_leak); + +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages, nesting; + enum fixed_addresses idx0, idx; + + WARN_ON(system_state != SYSTEM_BOOTING); + + nesting = early_ioremap_nested; + if (early_ioremap_debug) { + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", + phys_addr, size, nesting); + dump_stack(); + } + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) { + WARN_ON(1); + return NULL; + } + + if (nesting >= FIX_BTMAPS_NESTING) { + WARN_ON(1); + return NULL; + } + early_ioremap_nested++; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) { + WARN_ON(1); + return NULL; + } + + /* + * Ok, go for it.. + */ + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = idx0; + while (nrpages > 0) { + early_set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + if (early_ioremap_debug) + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + + return (void *) (offset + fix_to_virt(idx0)); +} + +void __init early_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + unsigned int nesting; + + nesting = --early_ioremap_nested; + WARN_ON(nesting < 0); + + if (early_ioremap_debug) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, + size, nesting); + dump_stack(); + } + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { + WARN_ON(1); + return; + } + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + while (nrpages > 0) { + early_clear_fixmap(idx); + --idx; + --nrpages; + } +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c deleted file mode 100644 index 0b278315d73..00000000000 --- a/arch/x86/mm/ioremap_32.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * arch/i386/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t prot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (void __iomem *) phys_to_virt(phys_addr); - - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (phys_addr <= virt_to_phys(high_memory - 1)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } - - prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY - | _PAGE_ACCESSED | flags); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long) addr, - (unsigned long) addr + size, phys_addr, prot)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (last_addr < virt_to_phys(high_memory) - 1) { - struct page *ppage = virt_to_page(__va(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if ((void __force *)addr <= high_memory) - return; - - /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. - */ - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { - change_page_attr(virt_to_page(__va(p->phys_addr)), - get_vm_area_size(p) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return phys_to_virt(phys_addr); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c deleted file mode 100644 index 6cac90aa503..00000000000 --- a/arch/x86/mm/ioremap_64.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> - -#include <asm/pgalloc.h> -#include <asm/fixmap.h> -#include <asm/tlbflush.h> -#include <asm/cacheflush.h> -#include <asm/proto.h> - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Fix up the linear direct mapping of the kernel to avoid cache attribute - * conflicts. - */ -static int -ioremap_change_attr(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - int err = 0; - if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vaddr = (unsigned long) __va(phys_addr); - - /* - * Must use a address here and not struct page because the phys addr - * can be a in hole between nodes and not have an memmap entry. - */ - err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); - if (!err) - global_flush_tlb(); - } - return err; -} - -/* - * Generic mapping function - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t pgprot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (__force void __iomem *)phys_to_virt(phys_addr); - -#ifdef CONFIG_FLATMEM - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (last_addr < virt_to_phys(high_memory)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } -#endif - - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL - | _PAGE_DIRTY | _PAGE_ACCESSED | flags); - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); - return NULL; - } - if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { - area->flags &= 0xffffff; - vunmap(addr); - return NULL; - } - return (__force void __iomem *) (offset + (char *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return __ioremap(phys_addr, size, _PAGE_PCD); -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if (addr <= high_memory) - return; - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if (p->flags >> 20) - ioremap_change_attr(p->phys_addr, p->size, 0); - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index a96006f7ae0..7a2ebce87df 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -1,9 +1,9 @@ -/* +/* * AMD K8 NUMA support. * Discover the memory map and associated nodes. - * + * * This version reads it directly from the K8 northbridge. - * + * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ #include <linux/kernel.h> @@ -22,132 +22,135 @@ static __init int find_northbridge(void) { - int num; + int num; - for (num = 0; num < 32; num++) { + for (num = 0; num < 32; num++) { u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) - continue; - return num; - } - - return -1; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -1; } int __init k8_scan_nodes(unsigned long start, unsigned long end) -{ +{ unsigned long prevbase; struct bootnode nodes[8]; - int nodeid, i, j, nb; + int nodeid, i, nb; unsigned char nodeids[8]; int found = 0; u32 reg; unsigned numnodes; - unsigned num_cores; + unsigned cores; + unsigned bits; + int j; if (!early_pci_allowed()) return -1; - nb = find_northbridge(); - if (nb < 0) + nb = find_northbridge(); + if (nb < 0) return nb; - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - - num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - printk(KERN_INFO "CPU has %d num_cores\n", num_cores); + printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - reg = read_pci_config(0, nb, 0, 0x60); + reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) return -1; printk(KERN_INFO "Number of nodes %d\n", numnodes); - memset(&nodes,0,sizeof(nodes)); + memset(&nodes, 0, sizeof(nodes)); prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base,limit; + for (i = 0; i < 8; i++) { + unsigned long base, limit; u32 nodeid; - + base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); - nodeid = limit & 7; + nodeid = limit & 7; nodeids[i] = nodeid; - if ((base & 3) == 0) { + if ((base & 3) == 0) { if (i < numnodes) - printk("Skipping disabled node %d\n", i); + printk("Skipping disabled node %d\n", i); continue; - } + } if (nodeid >= numnodes) { printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); + base, limit); continue; - } + } - if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, - base); + if (!limit) { + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", + i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); - return -1; - } + printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", + nodeid, (base>>8)&3, (limit>>8) & 3); + return -1; + } if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", + printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); continue; } - limit >>= 16; - limit <<= 24; + limit >>= 16; + limit <<= 24; limit |= (1<<24)-1; limit++; if (limit > end_pfn << PAGE_SHIFT) limit = end_pfn << PAGE_SHIFT; if (limit <= base) - continue; - + continue; + base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); - continue; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + printk(KERN_ERR "Empty node %d\n", nodeid); + continue; } - if (limit < base) { + if (limit < base) { printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); + nodeid, base, limit); continue; - } - + } + /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { + if (prevbase > base) { printk(KERN_ERR "Node map not sorted %lx,%lx\n", - prevbase,base); + prevbase, base); return -1; } - - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - + + printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + found++; - - nodes[nodeid].start = base; + + nodes[nodeid].start = base; nodes[nodeid].end = limit; e820_register_active_regions(nodeid, nodes[nodeid].start >> PAGE_SHIFT, @@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; node_set(nodeid, node_possible_map); - } + } if (!found) - return -1; + return -1; memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + /* use the coreid bits from early_identify_cpu */ + bits = boot_cpu_data.x86_coreid_bits; + cores = (1<<bits); for (i = 0; i < 8; i++) { - if (nodes[i].start != nodes[i].end) { + if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; - for (j = 0; j < num_cores; j++) - apicid_to_node[(nodeid * num_cores) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } + for (j = 0; j < cores; j++) + apicid_to_node[(nodeid << bits) + j] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } } numa_init_array(); return 0; -} +} diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c index 552e0847375..56fe7124fbe 100644 --- a/arch/x86/mm/mmap_32.c +++ b/arch/x86/mm/mmap.c @@ -1,10 +1,13 @@ /* - * linux/arch/i386/mm/mmap.c + * Flexible mmap layout support * - * flexible mmap layout support + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,14 +22,12 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> */ #include <linux/personality.h> #include <linux/mm.h> #include <linux/random.h> +#include <linux/limits.h> #include <linux/sched.h> /* @@ -37,20 +38,71 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline unsigned long mmap_base(struct mm_struct *mm) +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static int mmap_is_ia32(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - unsigned long random_factor = 0; +#ifdef CONFIG_X86_32 + return 1; +#endif +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + return 1; +#endif + return 0; +} - if (current->flags & PF_RANDOMIZE) - random_factor = get_random_int() % (1024*1024); +static int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +static unsigned long mmap_rnd(void) +{ + unsigned long rnd = 0; + + /* + * 8 bits of randomness in 32bit mmaps, 20 address space bits + * 28 bits of randomness in 64bit mmaps, 40 address space bits + */ + if (current->flags & PF_RANDOMIZE) { + if (mmap_is_ia32()) + rnd = (long)get_random_int() % (1<<8); + else + rnd = (long)(get_random_int() % (1<<28)); + } + return rnd << PAGE_SHIFT; +} + +static unsigned long mmap_base(void) +{ + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - random_factor); + return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); +} + +/* + * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 + * does, but not when emulating X86_32 + */ +static unsigned long mmap_legacy_base(void) +{ + if (mmap_is_ia32()) + return TASK_UNMAPPED_BASE; + else + return TASK_UNMAPPED_BASE + mmap_rnd(); } /* @@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm) */ void arch_pick_mmap_layout(struct mm_struct *mm) { - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { - mm->mmap_base = TASK_UNMAPPED_BASE; + if (mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; } else { - mm->mmap_base = mmap_base(mm); + mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; mm->unmap_area = arch_unmap_area_topdown; } diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c deleted file mode 100644 index 80bba0dc000..00000000000 --- a/arch/x86/mm/mmap_64.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright 2005 Andi Kleen, SuSE Labs. - * Licensed under GPL, v.2 - */ -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/random.h> -#include <asm/ia32.h> - -/* Notebook: move the mmap code from sys_x86_64.c over here. */ - -void arch_pick_mmap_layout(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->flags & _TIF_IA32) - return ia32_pick_mmap_layout(mm); -#endif - mm->mmap_base = TASK_UNMAPPED_BASE; - if (current->flags & PF_RANDOMIZE) { - /* Add 28bit randomness which is about 40bits of address space - because mmap base has to be page aligned. - or ~1/128 of the total user VM - (total user address space is 47bits) */ - unsigned rnd = get_random_int() & 0xfffffff; - mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; - } - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d6926ba899..dc3b1f7e145 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -1,7 +1,7 @@ -/* +/* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ + */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> @@ -11,35 +11,45 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/nodemask.h> +#include <linux/sched.h> #include <asm/e820.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/numa.h> #include <asm/acpi.h> +#include <asm/k8.h> #ifndef Dprintk #define Dprintk(x...) #endif struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { +int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +void *x86_cpu_to_node_map_early_ptr; +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); + +s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_to_cpumask_map); int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size; * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +static int __init populate_memnodemap(const struct bootnode *nodes, + int numnodes, int shift) { - int i; - int res = -1; unsigned long addr, end; + int i, res = -1; - memset(memnodemap, 0xff, memnodemapsize); + memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; @@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) if ((end >> shift) >= memnodemapsize) return 0; do { - if (memnodemap[addr >> shift] != 0xff) + if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; memnodemap[addr >> shift] = i; addr += (1UL << shift); } while (addr < end); res = 1; - } + } return res; } @@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void) unsigned long pad, pad_addr; memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) + if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) return 0; pad = L1_CACHE_BYTES - 1; pad_addr = 0x8000; - nodemap_size = pad + memnodemapsize; + nodemap_size = pad + sizeof(s16) * memnodemapsize; nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, nodemap_size); if (nodemap_addr == -1UL) { @@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void) } pad_addr = (nodemap_addr + pad) & ~pad; memnodemap = phys_to_virt(pad_addr); + reserve_early(nodemap_addr, nodemap_addr + nodemap_size); printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", nodemap_addr, nodemap_addr + nodemap_size); @@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void) * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. */ -static int __init -extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +static int __init extract_lsb_from_nodes(const struct bootnode *nodes, + int numnodes) { int i, nodes_used = 0; unsigned long start, end; @@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) shift); if (populate_memnodemap(nodes, numnodes, shift) != 1) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d\n", - shift); + printk(KERN_INFO "Your memory is not aligned you need to " + "rebuild your kernel with a bigger NODEMAPSIZE " + "shift=%d\n", shift); return -1; } return shift; } -#ifdef CONFIG_SPARSEMEM int early_pfn_to_nid(unsigned long pfn) { return phys_to_nid(pfn << PAGE_SHIFT); } -#endif -static void * __init -early_node_mem(int nodeid, unsigned long start, unsigned long end, - unsigned long size) +static void * __init early_node_mem(int nodeid, unsigned long start, + unsigned long end, unsigned long size) { unsigned long mem = find_e820_area(start, end, size); void *ptr; + if (mem != -1L) return __va(mem); ptr = __alloc_bootmem_nopanic(size, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); if (ptr == NULL) { printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); + size, nodeid); return NULL; } return ptr; } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; - unsigned long nodedata_phys; +void __init setup_node_bootmem(int nodeid, unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; + unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); - start = round_up(start, ZONE_ALIGN); + start = round_up(start, ZONE_ALIGN); - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, + start, end); start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; @@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; /* Find a place for the bootmem map */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<<PAGE_SHIFT); if (bootmap == NULL) { if (nodedata_phys < start || nodedata_phys >= end) - free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + free_bootmem((unsigned long)node_data[nodeid], + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); - + Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + bootmap_start >> PAGE_SHIFT, + start_pfn, end_pfn); free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, + bootmap_pages<<PAGE_SHIFT); #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif node_set_online(nodeid); -} - -/* Initialize final allocator for a zone */ -void __init setup_node_zones(int nodeid) -{ - unsigned long start_pfn, end_pfn, memmapsize, limit; - - start_pfn = node_start_pfn(nodeid); - end_pfn = node_end_pfn(nodeid); - - Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", - nodeid, start_pfn, end_pfn); - - /* Try to allocate mem_map at end to not fill up precious <4GB - memory. */ - memmapsize = sizeof(struct page) * (end_pfn-start_pfn); - limit = end_pfn << PAGE_SHIFT; -#ifdef CONFIG_FLAT_NODE_MEM_MAP - NODE_DATA(nodeid)->node_mem_map = - __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, - memmapsize, SMP_CACHE_BYTES, - round_down(limit - memmapsize, PAGE_SIZE), - limit); -#endif -} +} +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ void __init numa_init_array(void) { int rr, i; - /* There are unfortunately some poorly designed mainboards around - that only connect memory to a single CPU. This breaks the 1:1 cpu->node - mapping. To avoid this fill in the mapping for all possible - CPUs, as the number of CPUs is not known yet. - We round robin the existing nodes. */ + rr = first_node(node_online_map); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node(i) != NUMA_NO_NODE) + if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; - numa_set_node(i, rr); + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); } - } #ifdef CONFIG_NUMA_EMU @@ -276,15 +265,17 @@ void __init numa_init_array(void) char *cmdline __initdata; /* - * Setups up nid to range from addr to addr + size. If the end boundary is - * greater than max_addr, then max_addr is used instead. The return value is 0 - * if there is additional memory left for allocation past addr and -1 otherwise. - * addr is adjusted to be at the end of the node. + * Setups up nid to range from addr to addr + size. If the end + * boundary is greater than max_addr, then max_addr is used instead. + * The return value is 0 if there is additional memory left for + * allocation past addr and -1 otherwise. addr is adjusted to be at + * the end of the node. */ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, u64 size, u64 max_addr) { int ret = 0; + nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, for (i = node_start; i < num_nodes + node_start; i++) { u64 end = *addr + size; + if (i < big) end += FAKE_NODE_MIN_SIZE; /* @@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { struct bootnode nodes[MAX_NUMNODES]; - u64 addr = start_pfn << PAGE_SHIFT; + u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - int num_nodes = 0; - int coeff_flag; - int coeff = -1; - int num = 0; - u64 size; - int i; + int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); /* @@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) * system RAM into N fake nodes. */ if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, - simple_strtol(cmdline, NULL, 0)); + long n = simple_strtol(cmdline, NULL, 0); + + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); if (num_nodes < 0) return num_nodes; goto out; @@ -483,46 +471,47 @@ out: for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); } acpi_fake_nodes(nodes, num_nodes); - numa_init_array(); - return 0; + numa_init_array(); + return 0; } #endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ +{ int i; nodes_clear(node_possible_map); #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) + if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT)) return; nodes_clear(node_possible_map); #endif printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ - memnode_shift = 63; + end_pfn << PAGE_SHIFT); + /* setup dummy node covering all memory */ + memnode_shift = 63; memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); @@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - node_to_cpumask[0] = cpumask_of_cpu(0); + /* cpumask_of_cpu() may not be available during early startup */ + memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); + cpu_set(0, node_to_cpumask_map[0]); e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } __cpuinit void numa_add_cpu(int cpu) { - set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); -} + set_bit(cpu, + (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); +} void __cpuinit numa_set_node(int cpu, int node) { + int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; + cpu_pda(cpu)->nodenumber = node; - cpu_to_node(cpu) = node; + + if(cpu_to_node_map) + cpu_to_node_map[cpu] = node; + else if(per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); } -unsigned long __init numa_free_all_bootmem(void) -{ - int i; +unsigned long __init numa_free_all_bootmem(void) +{ unsigned long pages = 0; - for_each_online_node(i) { + int i; + + for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - } + return pages; -} +} void __init paging_init(void) -{ - int i; +{ unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -568,32 +569,27 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); - for_each_online_node(i) { - setup_node_zones(i); - } - free_area_init_nodes(max_zone_pfns); -} +} static __init int numa_setup(char *opt) -{ +{ if (!opt) return -EINVAL; - if (!strncmp(opt,"off",3)) + if (!strncmp(opt, "off", 3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt,"noacpi",6)) - acpi_numa = -1; - if (!strncmp(opt,"hotadd=", 7)) + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; + if (!strncmp(opt, "hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 0; -} - +} early_param("numa", numa_setup); /* @@ -611,38 +607,16 @@ early_param("numa", numa_setup); void __init init_cpu_to_node(void) { int i; - for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid_init[i]; + + for (i = 0; i < NR_CPUS; i++) { + u16 apicid = x86_cpu_to_apicid_init[i]; + if (apicid == BAD_APICID) continue; if (apicid_to_node[apicid] == NUMA_NO_NODE) continue; - numa_set_node(i,apicid_to_node[apicid]); + numa_set_node(i, apicid_to_node[apicid]); } } -EXPORT_SYMBOL(cpu_to_node); -EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); -EXPORT_SYMBOL(node_data); - -#ifdef CONFIG_DISCONTIGMEM -/* - * Functions to convert PFNs from/to per node page addresses. - * These are out of line because they are quite big. - * They could be all tuned by pre caching more state. - * Should do that. - */ -int pfn_valid(unsigned long pfn) -{ - unsigned nid; - if (pfn >= num_physpages) - return 0; - nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; - return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); -} -EXPORT_SYMBOL(pfn_valid); -#endif diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c new file mode 100644 index 00000000000..06353d43f72 --- /dev/null +++ b/arch/x86/mm/pageattr-test.c @@ -0,0 +1,224 @@ +/* + * self test for change_page_attr. + * + * Clears the global bit on random pages in the direct mapping, then reverts + * and compares page tables forwards and afterwards. + */ +#include <linux/bootmem.h> +#include <linux/random.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/pgtable.h> +#include <asm/kdebug.h> + +enum { + NTEST = 4000, +#ifdef CONFIG_X86_64 + LPS = (1 << PMD_SHIFT), +#elif defined(CONFIG_X86_PAE) + LPS = (1 << PMD_SHIFT), +#else + LPS = (1 << 22), +#endif + GPS = (1<<30) +}; + +struct split_state { + long lpg, gpg, spg, exec; + long min_exec, max_exec; +}; + +static __init int print_split(struct split_state *s) +{ + long i, expected, missed = 0; + int printed = 0; + int err = 0; + + s->lpg = s->gpg = s->spg = s->exec = 0; + s->min_exec = ~0UL; + s->max_exec = 0; + for (i = 0; i < max_pfn_mapped; ) { + unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT); + int level; + pte_t *pte; + + pte = lookup_address(addr, &level); + if (!pte) { + if (!printed) { + dump_pagetable(addr); + printk(KERN_INFO "CPA %lx no pte level %d\n", + addr, level); + printed = 1; + } + missed++; + i++; + continue; + } + + if (level == PG_LEVEL_1G && sizeof(long) == 8) { + s->gpg++; + i += GPS/PAGE_SIZE; + } else if (level == PG_LEVEL_2M) { + if (!(pte_val(*pte) & _PAGE_PSE)) { + printk(KERN_ERR + "%lx level %d but not PSE %Lx\n", + addr, level, (u64)pte_val(*pte)); + err = 1; + } + s->lpg++; + i += LPS/PAGE_SIZE; + } else { + s->spg++; + i++; + } + if (!(pte_val(*pte) & _PAGE_NX)) { + s->exec++; + if (addr < s->min_exec) + s->min_exec = addr; + if (addr > s->max_exec) + s->max_exec = addr; + } + } + printk(KERN_INFO + "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + + expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; + if (expected != i) { + printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n", + max_pfn_mapped, expected); + return 1; + } + return err; +} + +static unsigned long __initdata addr[NTEST]; +static unsigned int __initdata len[NTEST]; + +/* Change the global bit on random pages in the direct mapping */ +static __init int exercise_pageattr(void) +{ + struct split_state sa, sb, sc; + unsigned long *bm; + pte_t *pte, pte0; + int failed = 0; + int level; + int i, k; + int err; + + printk(KERN_INFO "CPA exercising pageattr\n"); + + bm = vmalloc((max_pfn_mapped + 7) / 8); + if (!bm) { + printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); + return -ENOMEM; + } + memset(bm, 0, (max_pfn_mapped + 7) / 8); + + failed += print_split(&sa); + srandom32(100); + + for (i = 0; i < NTEST; i++) { + unsigned long pfn = random32() % max_pfn_mapped; + + addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); + len[i] = random32() % 100; + len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); + + if (len[i] == 0) + len[i] = 1; + + pte = NULL; + pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */ + + for (k = 0; k < len[i]; k++) { + pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { + addr[i] = 0; + break; + } + if (k == 0) { + pte0 = *pte; + } else { + if (pgprot_val(pte_pgprot(*pte)) != + pgprot_val(pte_pgprot(pte0))) { + len[i] = k; + break; + } + } + if (test_bit(pfn + k, bm)) { + len[i] = k; + break; + } + __set_bit(pfn + k, bm); + } + if (!addr[i] || !pte || !k) { + addr[i] = 0; + continue; + } + + err = change_page_attr_clear(addr[i], len[i], + __pgprot(_PAGE_GLOBAL)); + if (err < 0) { + printk(KERN_ERR "CPA %d failed %d\n", i, err); + failed++; + } + + pte = lookup_address(addr[i], &level); + if (!pte || pte_global(*pte) || pte_huge(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], + pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + if (level != PG_LEVEL_4K) { + printk(KERN_ERR "CPA %lx: unexpected level %d\n", + addr[i], level); + failed++; + } + + } + vfree(bm); + + failed += print_split(&sb); + + printk(KERN_INFO "CPA reverting everything\n"); + for (i = 0; i < NTEST; i++) { + if (!addr[i]) + continue; + pte = lookup_address(addr[i], &level); + if (!pte) { + printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]); + failed++; + continue; + } + err = change_page_attr_set(addr[i], len[i], + __pgprot(_PAGE_GLOBAL)); + if (err < 0) { + printk(KERN_ERR "CPA reverting failed: %d\n", err); + failed++; + } + pte = lookup_address(addr[i], &level); + if (!pte || !pte_global(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", + addr[i], pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + + } + + failed += print_split(&sc); + + if (failed) { + printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + WARN_ON(1); + } else { + printk(KERN_INFO "CPA selftests PASSED\n"); + } + + return 0; +} +module_init(exercise_pageattr); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c new file mode 100644 index 00000000000..1cc6607eacb --- /dev/null +++ b/arch/x86/mm/pageattr.c @@ -0,0 +1,564 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/e820.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @addr: virtual start address + * @size: number of bytes to flush + * + * clflush is an unordered instruction which needs fencing with mfence + * to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + void *vend = vaddr + size - 1; + + mb(); + + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) + clflush(vaddr); + /* + * Flush any possible final partial cacheline: + */ + clflush(vend); + + mb(); +} + +static void __cpa_flush_all(void *arg) +{ + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (boot_cpu_data.x86_model >= 4) + wbinvd(); +} + +static void cpa_flush_all(void) +{ + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, NULL, 1, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled()); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1, 1); + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && pte_present(*pte)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) +{ + pgprot_t forbidden = __pgprot(0); + + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. + */ + if (within(__pa(address), BIOS_BEGIN, BIOS_END)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + +#ifdef CONFIG_DEBUG_RODATA + /* The .rodata section needs to be read-only */ + if (within(address, (unsigned long)__start_rodata, + (unsigned long)__end_rodata)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +pte_t *lookup_address(unsigned long address, int *level) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + return pte_offset_kernel(pmd, address); +} + +static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + /* change init_mm */ + set_pte_atomic(kpte, pte); +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + } +#endif +} + +static int split_large_page(pte_t *kpte, unsigned long address) +{ + pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + gfp_t gfp_flags = GFP_KERNEL; + unsigned long flags; + unsigned long addr; + pte_t *pbase, *tmp; + struct page *base; + unsigned int i, level; + +#ifdef CONFIG_DEBUG_PAGEALLOC + gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN; + gfp_flags = GFP_ATOMIC | __GFP_NOWARN; +#endif + base = alloc_pages(gfp_flags, 0); + if (!base) + return -ENOMEM; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) { + WARN_ON_ONCE(1); + goto out_unlock; + } + + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); +#ifdef CONFIG_X86_32 + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); +#endif + + pgprot_val(ref_prot) &= ~_PAGE_NX; + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) + set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); + + /* + * Install the new, split up pagetable. Important detail here: + * + * On Intel the NX bit of all levels must be cleared to make a + * page executable. See section 4.13.2 of Intel 64 and IA-32 + * Architectures Software Developer's Manual). + */ + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); + base = NULL; + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + + if (base) + __free_pages(base, 0); + + return 0; +} + +static int +__change_page_attr(unsigned long address, unsigned long pfn, + pgprot_t mask_set, pgprot_t mask_clr) +{ + struct page *kpte_page; + int level, err = 0; + pte_t *kpte; + +#ifdef CONFIG_X86_32 + BUG_ON(pfn > max_low_pfn); +#endif + +repeat: + kpte = lookup_address(address, &level); + if (!kpte) + return -EINVAL; + + kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + + if (level == PG_LEVEL_4K) { + pgprot_t new_prot = pte_pgprot(*kpte); + pte_t new_pte, old_pte = *kpte; + + pgprot_val(new_prot) &= ~pgprot_val(mask_clr); + pgprot_val(new_prot) |= pgprot_val(mask_set); + + new_prot = static_protections(new_prot, address); + + new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); + + set_pte_atomic(kpte, new_pte); + } else { + err = split_large_page(kpte, address); + if (!err) + goto repeat; + } + return err; +} + +/** + * change_page_attr_addr - Change page table attributes in linear mapping + * @address: Virtual address in linear mapping. + * @prot: New page table attribute (PAGE_*) + * + * Change page attributes of a page in the direct mapping. This is a variant + * of change_page_attr() that also works on memory holes that do not have + * mem_map entry (pfn_valid() is false). + * + * See change_page_attr() documentation for more details. + * + * Modules and drivers should use the set_memory_* APIs instead. + */ + +#define HIGH_MAP_START __START_KERNEL_map +#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) + +static int +change_page_attr_addr(unsigned long address, pgprot_t mask_set, + pgprot_t mask_clr) +{ + unsigned long phys_addr = __pa(address); + unsigned long pfn = phys_addr >> PAGE_SHIFT; + int err; + +#ifdef CONFIG_X86_64 + /* + * If we are inside the high mapped kernel range, then we + * fixup the low mapping first. __va() returns the virtual + * address in the linear mapping: + */ + if (within(address, HIGH_MAP_START, HIGH_MAP_END)) + address = (unsigned long) __va(phys_addr); +#endif + + err = __change_page_attr(address, pfn, mask_set, mask_clr); + if (err) + return err; + +#ifdef CONFIG_X86_64 + /* + * If the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) { + /* + * Calc the high mapping address. See __phys_addr() + * for the non obvious details. + */ + address = phys_addr + HIGH_MAP_START - phys_base; + /* Make sure the kernel mappings stay executable */ + pgprot_val(mask_clr) |= _PAGE_NX; + + /* + * Our high aliases are imprecise, because we check + * everything between 0 and KERNEL_TEXT_SIZE, so do + * not propagate lookup failures back to users: + */ + __change_page_attr(address, pfn, mask_set, mask_clr); + } +#endif + return err; +} + +static int __change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + unsigned int i; + int ret; + + for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { + ret = change_page_attr_addr(addr, mask_set, mask_clr); + if (ret) + return ret; + } + + return 0; +} + +static int change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + int ret = __change_page_attr_set_clr(addr, numpages, mask_set, + mask_clr); + + /* + * On success we use clflush, when the CPU supports it to + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): + */ + if (!ret && cpu_has_clflush) + cpa_flush_range(addr, numpages); + else + cpa_flush_all(); + + return ret; +} + +static inline int change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_uc); + +int set_memory_wb(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); +} + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG) +static inline int __change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int __change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); +} +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return __change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PRESENT | _PAGE_RW)); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return __change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PRESENT)); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * If page allocator is not up yet then do not call c_p_a(): + */ + if (!debug_pagealloc_enabled) + return; + + /* + * The return value is ignored - the calls cannot fail, + * large pages are disabled at boot time: + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu: + */ + __flush_tlb_all(); +} +#endif + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c deleted file mode 100644 index 260073c0760..00000000000 --- a/arch/x86/mm/pageattr_32.c +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/pgalloc.h> -#include <asm/sections.h> - -static DEFINE_SPINLOCK(cpa_lock); -static struct list_head df_list = LIST_HEAD_INIT(df_list); - - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - return pte_offset_kernel(pmd, address); -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base; - pte_t *pbase; - - spin_unlock_irq(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); - spin_lock_irq(&cpa_lock); - if (!base) - return NULL; - - /* - * page_private is used to track the number of entries in - * the page table page that have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot)); - } - return base; -} - -static void cache_flush_page(struct page *p) -{ - void *adr = page_address(p); - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *lh = (struct list_head *)arg; - struct page *p; - - /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { - list_for_each_entry (p, lh, lru) - cache_flush_page(p); - } else if (boot_cpu_data.x86_model >= 4) - wbinvd(); - - /* Flush all to work around Errata in early athlons regarding - * large page flushing. - */ - __flush_tlb_all(); -} - -static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) -{ - struct page *page; - unsigned long flags; - - set_pte_atomic(kpte, pte); /* change init_mm */ - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = (struct page *)page->index) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static inline void revert_page(struct page *kpte_page, unsigned long address) -{ - pgprot_t ref_prot; - pte_t *linear; - - ref_prot = - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) - ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE; - - linear = (pte_t *) - pmd_offset(pud_offset(pgd_offset_k(address), address), address); - set_pmd_pte(linear, address, - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, - ref_prot)); -} - -static inline void save_page(struct page *kpte_page) -{ - if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) - list_add(&kpte_page->lru, &df_list); -} - -static int -__change_page_attr(struct page *page, pgprot_t prot) -{ - pte_t *kpte; - unsigned long address; - struct page *kpte_page; - - BUG_ON(PageHighMem(page)); - address = (unsigned long)page_address(page); - - kpte = lookup_address(address); - if (!kpte) - return -EINVAL; - kpte_page = virt_to_page(kpte); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - - if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if (!pte_huge(*kpte)) { - set_pte_atomic(kpte, mk_pte(page, prot)); - } else { - pgprot_t ref_prot; - struct page *split; - - ref_prot = - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) - ? PAGE_KERNEL_EXEC : PAGE_KERNEL; - split = split_large_page(address, prot, ref_prot); - if (!split) - return -ENOMEM; - set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* - * If the pte was reserved, it means it was created at boot - * time (not via split_large_page) and in turn we must not - * replace it with a largepage. - */ - - save_page(kpte_page); - if (!PageReserved(kpte_page)) { - if (cpu_has_pse && (page_private(kpte_page) == 0)) { - paravirt_release_pt(page_to_pfn(kpte_page)); - revert_page(kpte_page, address); - } - } - return 0; -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - int err = 0; - int i; - unsigned long flags; - - spin_lock_irqsave(&cpa_lock, flags); - for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); - if (err) - break; - } - spin_unlock_irqrestore(&cpa_lock, flags); - return err; -} - -void global_flush_tlb(void) -{ - struct list_head l; - struct page *pg, *next; - - BUG_ON(irqs_disabled()); - - spin_lock_irq(&cpa_lock); - list_replace_init(&df_list, &l); - spin_unlock_irq(&cpa_lock); - flush_map(&l); - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (PageHighMem(page)) - return; - if (!enable) - debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); - - /* the return value is ignored - the calls cannot fail, - * large pages are disabled at boot time. - */ - change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); - /* we should perform an IPI and flush all tlbs, - * but that can deadlock->flush only current cpu. - */ - __flush_tlb_all(); -} -#endif - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c deleted file mode 100644 index c40afbaaf93..00000000000 --- a/arch/x86/mm/pageattr_64.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - /* - * page_private is used to track the number of entries in - * the page table page have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - -void clflush_cache_range(void *adr, int size) -{ - int i; - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *l = (struct list_head *)arg; - struct page *pg; - - /* When clflush is available always use it because it is - much cheaper than WBINVD. */ - /* clflush is still broken. Disable for now. */ - if (1 || !cpu_has_clflush) - asm volatile("wbinvd" ::: "memory"); - else list_for_each_entry(pg, l, lru) { - void *adr = page_address(pg); - clflush_cache_range(adr, PAGE_SIZE); - } - __flush_tlb_all(); -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ - -static inline void save_page(struct page *fpage) -{ - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) - list_add(&fpage->lru, &deferred_pages); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - unsigned long pfn; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; - large_pte = pfn_pte(pfn, ref_prot); - large_pte = pte_mkhuge(large_pte); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - pgprot_t ref_prot2; - - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this - * change_page_attr on the split page. - */ - struct page *split; - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); - if (!split) - return -ENOMEM; - pgprot_val(ref_prot2) &= ~_PAGE_NX; - set_pte(kpte, mk_pte(split, ref_prot2)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ - BUG_ON(PageReserved(kpte_page)); - - save_page(kpte_page); - if (page_private(kpte_page) == 0) - revert_page(address, ref_prot); - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0, kernel_map = 0; - int i; - - if (address >= __START_KERNEL_map - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { - address = (unsigned long)__va(__pa(address)); - kernel_map = 1; - } - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - if (!kernel_map || pte_present(pfn_pte(0, prot))) { - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - } - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); - /* Make sure the kernel mappings stay executable */ - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); - err = __change_page_attr(addr2, pfn, prot2, - PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct page *pg, *next; - struct list_head l; - - /* - * Write-protect the semaphore, to exclude two contexts - * doing a list_replace_init() call in parallel and to - * exclude new additions to the deferred_pages list: - */ - down_write(&init_mm.mmap_sem); - list_replace_init(&deferred_pages, &l); - up_write(&init_mm.mmap_sem); - - flush_map(&l); - - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index be61a1d845a..2ae5999a795 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void pmd_ctor(struct kmem_cache *cache, void *pmd) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the @@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd) * vmalloc faults work because attached pagetables are never freed. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; - static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - page->index = (unsigned long)pgd_list; - if (pgd_list) - set_page_private(pgd_list, (unsigned long)&page->index); - pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); + + list_add(&page->lru, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); - next = (struct page *)page->index; - pprev = (struct page **)page_private(page); - *pprev = next; - if (next) - set_page_private(next, (unsigned long)pprev); + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); } @@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd) if (SHARED_KERNEL_PMD) return; - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) -/* If we allocate a pmd for part of the kernel address space, then - make sure its initialized with the appropriate kernel mappings. - Otherwise use a cached zeroed pmd. */ -static pmd_t *pmd_cache_alloc(int idx) +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(pgd_t *pgdp) { - pmd_t *pmd; + int i; - if (idx >= USER_PTRS_PER_PGD) { - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; - if (pmd) - memcpy(pmd, - (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(pgd); + return 0; + } + + if (i >= USER_PTRS_PER_PGD) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); - } else - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - return pmd; + pud_populate(mm, pud, pmd); + } + + return 1; +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; } -static void pmd_cache_free(pmd_t *pmd, int idx) +static void pgd_mop_up_pmds(pgd_t *pgd) { - if (idx >= USER_PTRS_PER_PGD) - free_page((unsigned long)pmd); - else - kmem_cache_free(pmd_cache, pmd); } +#endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { - int i; pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; + mm->pgd = pgd; /* so that alloc_pd can use it */ - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pmd_t *pmd = pmd_cache_alloc(i); - - if (!pmd) - goto out_oom; - - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + quicklist_free(0, pgd_dtor, pgd); + pgd = NULL; } - return pgd; -out_oom: - for (i--; i >= 0; i--) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - quicklist_free(0, pgd_dtor, pgd); - return NULL; + return pgd; } void pgd_free(pgd_t *pgd) { - int i; - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - /* in the non-PAE case, free_pgtables() clears user pgd entries */ + pgd_mop_up_pmds(pgd); quicklist_free(0, pgd_dtor, pgd); } @@ -372,4 +376,3 @@ void check_pgt_cache(void) { quicklist_trim(0, pgd_dtor, 25, 16); } - diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index ea85172fc0c..65416f843e5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -130,6 +130,9 @@ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { int pxm, node; + int apic_id; + + apic_id = pa->apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { @@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) bad_srat(); return; } - apicid_to_node[pa->apic_id] = node; + apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Protect against too large hotadd areas that would fill up memory. - */ -static int hotadd_enough_memory(struct bootnode *nd) -{ - static unsigned long allocated; - static unsigned long last_area_end; - unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; - long mem = pages * sizeof(struct page); - unsigned long addr; - unsigned long allowed; - unsigned long oldpages = pages; - - if (mem < 0) - return 0; - allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; - allowed = (allowed / 100) * hotadd_percent; - if (allocated + mem > allowed) { - unsigned long range; - /* Give them at least part of their hotadd memory upto hotadd_percent - It would be better to spread the limit out - over multiple hotplug areas, but that is too complicated - right now */ - if (allocated >= allowed) - return 0; - range = allowed - allocated; - pages = (range / PAGE_SIZE); - mem = pages * sizeof(struct page); - nd->end = nd->start + range; - } - /* Not completely fool proof, but a good sanity check */ - addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); - if (addr == -1UL) - return 0; - if (pages != oldpages) - printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", - pages << PAGE_SHIFT); - last_area_end = addr + mem; - allocated += mem; - return 1; -} - -static int update_end_of_memory(unsigned long end) -{ - found_add_area = 1; - if ((end >> PAGE_SHIFT) > end_pfn) - end_pfn = end >> PAGE_SHIFT; - return 1; + pxm, apic_id, node); } -static inline int save_add_info(void) -{ - return hotadd_percent > 0; -} -#else int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE @@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif -#endif /* * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add infomation. + * Both SPARSE and RESERVE need nodes_add information. * This code supports one contiguous hot add area per node. */ static int reserve_hotadd(int node, unsigned long start, unsigned long end) @@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void unparse_node(int node) +static void __init unparse_node(int node) { int i; node_clear(node, nodes_parsed); @@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { + /* + * don't confuse VM with a node that doesn't have the + * minimum memory. + */ + if (nodes[i].end && + (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { unparse_node(i); node_set_offline(i); } @@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) setup_node_bootmem(i, nodes[i].start, nodes[i].end); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node(i) == NUMA_NO_NODE) + int node = early_cpu_to_node(i); + + if (node == NUMA_NO_NODE) continue; - if (!node_isset(cpu_to_node(i), node_possible_map)) + if (!node_isset(node, node_possible_map)) numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); @@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } #ifdef CONFIG_NUMA_EMU +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { + [0 ... MAX_NUMNODES-1] = PXM_INVAL +}; +static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; static int __init find_node_by_addr(unsigned long addr) { int ret = NUMA_NO_NODE; @@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr) break; } } - return i; + return ret; } /* @@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr) void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { int i, j; - int fake_node_to_pxm_map[MAX_NUMNODES] = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL - }; - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE - }; printk(KERN_INFO "Faking PXM affinity for fake nodes on real " "topology.\n"); |