aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile_323
-rw-r--r--arch/x86/mm/Makefile_643
-rw-r--r--arch/x86/mm/boot_ioremap_32.c100
-rw-r--r--arch/x86/mm/discontig_32.c110
-rw-r--r--arch/x86/mm/extable.c62
-rw-r--r--arch/x86/mm/extable_32.c35
-rw-r--r--arch/x86/mm/extable_64.c34
-rw-r--r--arch/x86/mm/fault.c986
-rw-r--r--arch/x86/mm/fault_32.c659
-rw-r--r--arch/x86/mm/fault_64.c623
-rw-r--r--arch/x86/mm/highmem_32.c47
-rw-r--r--arch/x86/mm/hugetlbpage.c3
-rw-r--r--arch/x86/mm/init_32.c425
-rw-r--r--arch/x86/mm/init_64.c418
-rw-r--r--arch/x86/mm/ioremap.c501
-rw-r--r--arch/x86/mm/ioremap_32.c274
-rw-r--r--arch/x86/mm/ioremap_64.c210
-rw-r--r--arch/x86/mm/k8topology_64.c173
-rw-r--r--arch/x86/mm/mmap.c (renamed from arch/x86/mm/mmap_32.c)86
-rw-r--r--arch/x86/mm/mmap_64.c29
-rw-r--r--arch/x86/mm/numa_64.c274
-rw-r--r--arch/x86/mm/pageattr-test.c224
-rw-r--r--arch/x86/mm/pageattr.c564
-rw-r--r--arch/x86/mm/pageattr_32.c278
-rw-r--r--arch/x86/mm/pageattr_64.c255
-rw-r--r--arch/x86/mm/pgtable_32.c145
-rw-r--r--arch/x86/mm/srat_64.c95
27 files changed, 3286 insertions, 3330 deletions
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
index 362b4ad082d..c36ae88bb54 100644
--- a/arch/x86/mm/Makefile_32
+++ b/arch/x86/mm/Makefile_32
@@ -2,9 +2,8 @@
# Makefile for the linux i386-specific parts of the memory manager.
#
-obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
+obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
obj-$(CONFIG_NUMA) += discontig_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
index 6bcb47945b8..688c8c28ac8 100644
--- a/arch/x86/mm/Makefile_64
+++ b/arch/x86/mm/Makefile_64
@@ -2,9 +2,8 @@
# Makefile for the linux x86_64-specific parts of the memory manager.
#
-obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
+obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NUMA) += numa_64.o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_64.o
-
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
deleted file mode 100644
index f14da2a53ec..00000000000
--- a/arch/x86/mm/boot_ioremap_32.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * arch/i386/mm/boot_ioremap.c
- *
- * Re-map functions for early boot-time before paging_init() when the
- * boot-time pagetables are still in use
- *
- * Written by Dave Hansen <haveblue@us.ibm.com>
- */
-
-
-/*
- * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
- * keeps that from happening. If anyone has a better way, I'm listening.
- *
- * boot_pte_t is defined only if this all works correctly
- */
-
-#undef CONFIG_X86_PAE
-#undef CONFIG_PARAVIRT
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/init.h>
-#include <linux/stddef.h>
-
-/*
- * I'm cheating here. It is known that the two boot PTE pages are
- * allocated next to each other. I'm pretending that they're just
- * one big array.
- */
-
-#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
-
-static unsigned long boot_pte_index(unsigned long vaddr)
-{
- return __pa(vaddr) >> PAGE_SHIFT;
-}
-
-static inline boot_pte_t* boot_vaddr_to_pte(void *address)
-{
- boot_pte_t* boot_pg = (boot_pte_t*)pg0;
- return &boot_pg[boot_pte_index((unsigned long)address)];
-}
-
-/*
- * This is only for a caller who is clever enough to page-align
- * phys_addr and virtual_source, and who also has a preference
- * about which virtual address from which to steal ptes
- */
-static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
- void* virtual_source)
-{
- boot_pte_t* pte;
- int i;
- char *vaddr = virtual_source;
-
- pte = boot_vaddr_to_pte(virtual_source);
- for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
- set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
- __flush_tlb_one(&vaddr[i*PAGE_SIZE]);
- }
-}
-
-/* the virtual space we're going to remap comes from this array */
-#define BOOT_IOREMAP_PAGES 4
-#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
-static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
- __attribute__ ((aligned (PAGE_SIZE)));
-
-/*
- * This only applies to things which need to ioremap before paging_init()
- * bt_ioremap() and plain ioremap() are both useless at this point.
- *
- * When used, we're still using the boot-time pagetables, which only
- * have 2 PTE pages mapping the first 8MB
- *
- * There is no unmap. The boot-time PTE pages aren't used after boot.
- * If you really want the space back, just remap it yourself.
- * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
- */
-__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
-{
- unsigned long last_addr, offset;
- unsigned int nrpages;
-
- last_addr = phys_addr + size - 1;
-
- /* page align the requested address */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr) - phys_addr;
-
- nrpages = size >> PAGE_SHIFT;
- if (nrpages > BOOT_IOREMAP_PAGES)
- return NULL;
-
- __boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
-
- return &boot_ioremap_space[offset];
-}
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 13a474d3c6e..04b1d20e261 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -32,6 +32,7 @@
#include <linux/kexec.h>
#include <linux/pfn.h>
#include <linux/swap.h>
+#include <linux/acpi.h>
#include <asm/e820.h>
#include <asm/setup.h>
@@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn;
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
unsigned long node_remap_size[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
static void *node_remap_start_vaddr[MAX_NUMNODES];
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
static unsigned long kva_start_pfn;
static unsigned long kva_pages;
/*
@@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid)
}
}
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * In the discontig memory model, a portion of the kernel virtual area (KVA)
+ * is reserved and portions of nodes are mapped using it. This is to allow
+ * node-local memory to be allocated for structures that would normally require
+ * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
+ * should be prepared to allocate from the bootmem allocator instead. This KVA
+ * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
+ * layout of memory that are broken if alloc_remap() succeeds for some of the
+ * map and fails for others
+ */
+static unsigned long node_remap_start_pfn[MAX_NUMNODES];
+static void *node_remap_end_vaddr[MAX_NUMNODES];
+static void *node_remap_alloc_vaddr[MAX_NUMNODES];
+static unsigned long node_remap_offset[MAX_NUMNODES];
+
void *alloc_remap(int nid, unsigned long size)
{
void *allocation = node_remap_alloc_vaddr[nid];
@@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void)
return reserve_pages;
}
+static void init_remap_allocator(int nid)
+{
+ node_remap_start_vaddr[nid] = pfn_to_kaddr(
+ kva_start_pfn + node_remap_offset[nid]);
+ node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+ (node_remap_size[nid] * PAGE_SIZE);
+ node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+ ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+ printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+ (ulong) node_remap_start_vaddr[nid],
+ (ulong) pfn_to_kaddr(highstart_pfn
+ + node_remap_offset[nid] + node_remap_size[nid]));
+}
+#else
+void *alloc_remap(int nid, unsigned long size)
+{
+ return NULL;
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+ return 0;
+}
+
+static void init_remap_allocator(int nid)
+{
+}
+
+void __init remap_numa_kva(void)
+{
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
extern void setup_bootmem_allocator(void);
unsigned long __init setup_memory(void)
{
int nid;
unsigned long system_start_pfn, system_max_low_pfn;
+ unsigned long wasted_pages;
/*
* When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -288,11 +336,18 @@ unsigned long __init setup_memory(void)
#ifdef CONFIG_BLK_DEV_INITRD
/* Numa kva area is below the initrd */
- if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image)
- kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image)
+ if (initrd_start)
+ kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
- kva_pages;
#endif
- kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
+
+ /*
+ * We waste pages past at the end of the KVA for no good reason other
+ * than how it is located. This is bad.
+ */
+ wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
+ kva_start_pfn -= wasted_pages;
+ kva_pages += wasted_pages;
system_max_low_pfn = max_low_pfn = find_max_low_pfn();
printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
@@ -318,19 +373,9 @@ unsigned long __init setup_memory(void)
printk("Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
for_each_online_node(nid) {
- node_remap_start_vaddr[nid] = pfn_to_kaddr(
- kva_start_pfn + node_remap_offset[nid]);
- /* Init the node remap allocator */
- node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
- (node_remap_size[nid] * PAGE_SIZE);
- node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
- ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+ init_remap_allocator(nid);
allocate_pgdat(nid);
- printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
- (ulong) node_remap_start_vaddr[nid],
- (ulong) pfn_to_kaddr(highstart_pfn
- + node_remap_offset[nid] + node_remap_size[nid]));
}
printk("High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
@@ -345,7 +390,8 @@ unsigned long __init setup_memory(void)
void __init numa_kva_reserve(void)
{
- reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
+ if (kva_pages)
+ reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages));
}
void __init zone_sizes_init(void)
@@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr)
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
+
+#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
+/*
+ * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
+ *
+ * These stub functions are needed to compile 32-bit NUMA when SRAT is
+ * not set. There are functions in srat_64.c for parsing this table
+ * and it may be possible to make them common functions.
+ */
+void acpi_numa_slit_init (struct acpi_table_slit *slit)
+{
+ printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
+}
+
+void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
+{
+}
+
+void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
+#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 00000000000..7e8db53528a
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,62 @@
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+
+
+int fixup_exception(struct pt_regs *regs)
+{
+ const struct exception_table_entry *fixup;
+
+#ifdef CONFIG_PNPBIOS
+ if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
+ extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+ extern u32 pnp_bios_is_utter_crap;
+ pnp_bios_is_utter_crap = 1;
+ printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+ __asm__ volatile(
+ "movl %0, %%esp\n\t"
+ "jmp *%1\n\t"
+ : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+ panic("do_trap: can't hit this");
+ }
+#endif
+
+ fixup = search_exception_tables(regs->ip);
+ if (fixup) {
+ regs->ip = fixup->fixup;
+ return 1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Need to defined our own search_extable on X86_64 to work around
+ * a B stepping K8 bug.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+ const struct exception_table_entry *last,
+ unsigned long value)
+{
+ /* B stepping K8 bug */
+ if ((value >> 32) == 0)
+ value |= 0xffffffffUL << 32;
+
+ while (first <= last) {
+ const struct exception_table_entry *mid;
+ long diff;
+
+ mid = (last - first) / 2 + first;
+ diff = mid->insn - value;
+ if (diff == 0)
+ return mid;
+ else if (diff < 0)
+ first = mid+1;
+ else
+ last = mid-1;
+ }
+ return NULL;
+}
+#endif
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
deleted file mode 100644
index 0ce4f22a263..00000000000
--- a/arch/x86/mm/extable_32.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/i386/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/uaccess.h>
-
-int fixup_exception(struct pt_regs *regs)
-{
- const struct exception_table_entry *fixup;
-
-#ifdef CONFIG_PNPBIOS
- if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
- {
- extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
- extern u32 pnp_bios_is_utter_crap;
- pnp_bios_is_utter_crap = 1;
- printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
- __asm__ volatile(
- "movl %0, %%esp\n\t"
- "jmp *%1\n\t"
- : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
- panic("do_trap: can't hit this");
- }
-#endif
-
- fixup = search_exception_tables(regs->eip);
- if (fixup) {
- regs->eip = fixup->fixup;
- return 1;
- }
-
- return 0;
-}
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
deleted file mode 100644
index 79ac6e7100a..00000000000
--- a/arch/x86/mm/extable_64.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * linux/arch/x86_64/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-
-/* Simple binary search */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- /* Work around a B stepping K8 bug */
- if ((value >> 32) == 0)
- value |= 0xffffffffUL << 32;
-
- while (first <= last) {
- const struct exception_table_entry *mid;
- long diff;
-
- mid = (last - first) / 2 + first;
- diff = mid->insn - value;
- if (diff == 0)
- return mid;
- else if (diff < 0)
- first = mid+1;
- else
- last = mid-1;
- }
- return NULL;
-}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 00000000000..e28cc5277b1
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,986 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h> /* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+
+/*
+ * Page fault error code bits
+ * bit 0 == 0 means no page found, 1 means protection fault
+ * bit 1 == 0 means read, 1 means write
+ * bit 2 == 0 means kernel, 1 means user-mode
+ * bit 3 == 1 means use of reserved bit detected
+ * bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT (1<<0)
+#define PF_WRITE (1<<1)
+#define PF_USER (1<<2)
+#define PF_RSVD (1<<3)
+#define PF_INSTR (1<<4)
+
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+#ifdef CONFIG_KPROBES
+ int ret = 0;
+
+ /* kprobe_running() needs smp_processor_id() */
+#ifdef CONFIG_X86_32
+ if (!user_mode_vm(regs)) {
+#else
+ if (!user_mode(regs)) {
+#endif
+ preempt_disable();
+ if (kprobe_running() && kprobe_fault_handler(regs, 14))
+ ret = 1;
+ preempt_enable();
+ }
+
+ return ret;
+#else
+ return 0;
+#endif
+}
+
+/*
+ * X86_32
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * X86_64
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner
+ */
+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
+ unsigned long error_code)
+{
+ unsigned char *instr;
+ int scan_more = 1;
+ int prefetch = 0;
+ unsigned char *max_instr;
+
+#ifdef CONFIG_X86_32
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return 0;
+#endif
+
+ /* If it was a exec fault on NX page, ignore */
+ if (error_code & PF_INSTR)
+ return 0;
+
+ instr = (unsigned char *)convert_ip_to_linear(current, regs);
+ max_instr = instr + 15;
+
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+ return 0;
+
+ while (scan_more && instr < max_instr) {
+ unsigned char opcode;
+ unsigned char instr_hi;
+ unsigned char instr_lo;
+
+ if (probe_kernel_address(instr, opcode))
+ break;
+
+ instr_hi = opcode & 0xf0;
+ instr_lo = opcode & 0x0f;
+ instr++;
+
+ switch (instr_hi) {
+ case 0x20:
+ case 0x30:
+ /*
+ * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+ * In X86_64 long mode, the CPU will signal invalid
+ * opcode if some of these prefixes are present so
+ * X86_64 will never get here anyway
+ */
+ scan_more = ((instr_lo & 7) == 0x6);
+ break;
+#ifdef CONFIG_X86_64
+ case 0x40:
+ /*
+ * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+ * Need to figure out under what instruction mode the
+ * instruction was issued. Could check the LDT for lm,
+ * but for now it's good enough to assume that long
+ * mode only uses well known segments or kernel.
+ */
+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+ break;
+#endif
+ case 0x60:
+ /* 0x64 thru 0x67 are valid prefixes in all modes. */
+ scan_more = (instr_lo & 0xC) == 0x4;
+ break;
+ case 0xF0:
+ /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+ scan_more = !instr_lo || (instr_lo>>1) == 1;
+ break;
+ case 0x00:
+ /* Prefetch instruction is 0x0F0D or 0x0F18 */
+ scan_more = 0;
+
+ if (probe_kernel_address(instr, opcode))
+ break;
+ prefetch = (instr_lo == 0xF) &&
+ (opcode == 0x0D || opcode == 0x18);
+ break;
+ default:
+ scan_more = 0;
+ break;
+ }
+ }
+ return prefetch;
+}
+
+static void force_sig_info_fault(int si_signo, int si_code,
+ unsigned long address, struct task_struct *tsk)
+{
+ siginfo_t info;
+
+ info.si_signo = si_signo;
+ info.si_errno = 0;
+ info.si_code = si_code;
+ info.si_addr = (void __user *)address;
+ force_sig_info(si_signo, &info, tsk);
+}
+
+#ifdef CONFIG_X86_64
+static int bad_address(void *p)
+{
+ unsigned long dummy;
+ return probe_kernel_address((unsigned long *)p, dummy);
+}
+#endif
+
+void dump_pagetable(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+ __typeof__(pte_val(__pte(0))) page;
+
+ page = read_cr3();
+ page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+ printk("*pdpt = %016Lx ", page);
+ if ((page >> PAGE_SHIFT) < max_low_pfn
+ && page & _PAGE_PRESENT) {
+ page &= PAGE_MASK;
+ page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+ & (PTRS_PER_PMD - 1)];
+ printk(KERN_CONT "*pde = %016Lx ", page);
+ page &= ~_PAGE_NX;
+ }
+#else
+ printk("*pde = %08lx ", page);
+#endif
+
+ /*
+ * We must not directly access the pte in the highpte
+ * case if the page table is located in highmem.
+ * And let's rather not kmap-atomic the pte, just in case
+ * it's allocated already.
+ */
+ if ((page >> PAGE_SHIFT) < max_low_pfn
+ && (page & _PAGE_PRESENT)
+ && !(page & _PAGE_PSE)) {
+ page &= PAGE_MASK;
+ page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+ & (PTRS_PER_PTE - 1)];
+ printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
+ }
+
+ printk("\n");
+#else /* CONFIG_X86_64 */
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = (pgd_t *)read_cr3();
+
+ pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+ pgd += pgd_index(address);
+ if (bad_address(pgd)) goto bad;
+ printk("PGD %lx ", pgd_val(*pgd));
+ if (!pgd_present(*pgd)) goto ret;
+
+ pud = pud_offset(pgd, address);
+ if (bad_address(pud)) goto bad;
+ printk("PUD %lx ", pud_val(*pud));
+ if (!pud_present(*pud)) goto ret;
+
+ pmd = pmd_offset(pud, address);
+ if (bad_address(pmd)) goto bad;
+ printk("PMD %lx ", pmd_val(*pmd));
+ if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (bad_address(pte)) goto bad;
+ printk("PTE %lx", pte_val(*pte));
+ret:
+ printk("\n");
+ return;
+bad:
+ printk("BAD\n");
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+ unsigned index = pgd_index(address);
+ pgd_t *pgd_k;
+ pud_t *pud, *pud_k;
+ pmd_t *pmd, *pmd_k;
+
+ pgd += index;
+ pgd_k = init_mm.pgd + index;
+
+ if (!pgd_present(*pgd_k))
+ return NULL;
+
+ /*
+ * set_pgd(pgd, *pgd_k); here would be useless on PAE
+ * and redundant with the set_pmd() on non-PAE. As would
+ * set_pud.
+ */
+
+ pud = pud_offset(pgd, address);
+ pud_k = pud_offset(pgd_k, address);
+ if (!pud_present(*pud_k))
+ return NULL;
+
+ pmd = pmd_offset(pud, address);
+ pmd_k = pmd_offset(pud_k, address);
+ if (!pmd_present(*pmd_k))
+ return NULL;
+ if (!pmd_present(*pmd)) {
+ set_pmd(pmd, *pmd_k);
+ arch_flush_lazy_mmu_mode();
+ } else
+ BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ return pmd_k;
+}
+#endif
+
+#ifdef CONFIG_X86_64
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+ BIOS SMM functions are required to use a specific workaround
+ to avoid corruption of the 64bit RIP register on C stepping K8.
+ A lot of BIOS that didn't get tested properly miss this.
+ The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ Try to work around it here.
+ Note we only handle faults in kernel here.
+ Does nothing for X86_32
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ static int warned;
+ if (address != regs->ip)
+ return 0;
+ if ((address >> 32) != 0)
+ return 0;
+ address |= 0xffffffffUL << 32;
+ if ((address >= (u64)_stext && address <= (u64)_etext) ||
+ (address >= MODULES_VADDR && address <= MODULES_END)) {
+ if (!warned) {
+ printk(errata93_warning);
+ warned = 1;
+ }
+ regs->ip = address;
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
+ * addresses >4GB. We catch this in the page fault handler because these
+ * addresses are not reachable. Just detect this case and return. Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+ if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+ (address >> 32))
+ return 1;
+#endif
+ return 0;
+}
+
+void do_invalid_op(struct pt_regs *, unsigned long);
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+ unsigned long nr;
+ /*
+ * Pentium F0 0F C7 C8 bug workaround.
+ */
+ if (boot_cpu_data.f00f_bug) {
+ nr = (address - idt_descr.address) >> 3;
+
+ if (nr == 6) {
+ do_invalid_op(regs, 0);
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
+{
+#ifdef CONFIG_X86_32
+ if (!oops_may_print())
+ return;
+#endif
+
+#ifdef CONFIG_X86_PAE
+ if (error_code & PF_INSTR) {
+ int level;
+ pte_t *pte = lookup_address(address, &level);
+
+ if (pte && pte_present(*pte) && !pte_exec(*pte))
+ printk(KERN_CRIT "kernel tried to execute "
+ "NX-protected page - exploit attempt? "
+ "(uid: %d)\n", current->uid);
+ }
+#endif
+
+ printk(KERN_ALERT "BUG: unable to handle kernel ");
+ if (address < PAGE_SIZE)
+ printk(KERN_CONT "NULL pointer dereference");
+ else
+ printk(KERN_CONT "paging request");
+#ifdef CONFIG_X86_32
+ printk(KERN_CONT " at %08lx\n", address);
+#else
+ printk(KERN_CONT " at %016lx\n", address);
+#endif
+ printk(KERN_ALERT "IP:");
+ printk_address(regs->ip, 1);
+ dump_pagetable(address);
+}
+
+#ifdef CONFIG_X86_64
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ unsigned long flags = oops_begin();
+ struct task_struct *tsk;
+
+ printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+ current->comm, address);
+ dump_pagetable(address);
+ tsk = current;
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+ if (__die("Bad pagetable", regs, error_code))
+ regs = NULL;
+ oops_end(flags, regs, SIGKILL);
+}
+#endif
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry. This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X). Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ */
+static int spurious_fault(unsigned long address,
+ unsigned long error_code)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ /* Reserved-bit violation or user access to kernel space? */
+ if (error_code & (PF_USER | PF_RSVD))
+ return 0;
+
+ pgd = init_mm.pgd + pgd_index(address);
+ if (!pgd_present(*pgd))
+ return 0;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return 0;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return 0;
+
+ pte = pte_offset_kernel(pmd, address);
+ if (!pte_present(*pte))
+ return 0;
+
+ if ((error_code & PF_WRITE) && !pte_write(*pte))
+ return 0;
+ if ((error_code & PF_INSTR) && !pte_exec(*pte))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * X86_32
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * X86_64
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+ unsigned long pgd_paddr;
+ pmd_t *pmd_k;
+ pte_t *pte_k;
+ /*
+ * Synchronize this task's top level page-table
+ * with the 'reference' page table.
+ *
+ * Do _not_ use "current" here. We might be inside
+ * an interrupt in the middle of a task switch..
+ */
+ pgd_paddr = read_cr3();
+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+ if (!pmd_k)
+ return -1;
+ pte_k = pte_offset_kernel(pmd_k, address);
+ if (!pte_present(*pte_k))
+ return -1;
+ return 0;
+#else
+ pgd_t *pgd, *pgd_ref;
+ pud_t *pud, *pud_ref;
+ pmd_t *pmd, *pmd_ref;
+ pte_t *pte, *pte_ref;
+
+ /* Copy kernel mappings over when needed. This can also
+ happen within a race in page table update. In the later
+ case just flush. */
+
+ pgd = pgd_offset(current->mm ?: &init_mm, address);
+ pgd_ref = pgd_offset_k(address);
+ if (pgd_none(*pgd_ref))
+ return -1;
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+ /* Below here mismatches are bugs because these lower tables
+ are shared */
+
+ pud = pud_offset(pgd, address);
+ pud_ref = pud_offset(pgd_ref, address);
+ if (pud_none(*pud_ref))
+ return -1;
+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+ BUG();
+ pmd = pmd_offset(pud, address);
+ pmd_ref = pmd_offset(pud_ref, address);
+ if (pmd_none(*pmd_ref))
+ return -1;
+ if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+ BUG();
+ pte_ref = pte_offset_kernel(pmd_ref, address);
+ if (!pte_present(*pte_ref))
+ return -1;
+ pte = pte_offset_kernel(pmd, address);
+ /* Don't use pte_page here, because the mappings can point
+ outside mem_map, and the NUMA hash lookup cannot handle
+ that. */
+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+ BUG();
+ return 0;
+#endif
+}
+
+int show_unhandled_signals = 1;
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ struct task_struct *tsk;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ unsigned long address;
+ int write, si_code;
+ int fault;
+#ifdef CONFIG_X86_64
+ unsigned long flags;
+#endif
+
+ /*
+ * We can fault from pretty much anywhere, with unknown IRQ state.
+ */
+ trace_hardirqs_fixup();
+
+ tsk = current;
+ mm = tsk->mm;
+ prefetchw(&mm->mmap_sem);
+
+ /* get the address */
+ address = read_cr2();
+
+ si_code = SEGV_MAPERR;
+
+ if (notify_page_fault(regs))
+ return;
+
+ /*
+ * We fault-in kernel-space virtual memory on-demand. The
+ * 'reference' page table is init_mm.pgd.
+ *
+ * NOTE! We MUST NOT take any locks for this case. We may
+ * be in an interrupt or a critical region, and should
+ * only copy the information from the master page table,
+ * nothing more.
+ *
+ * This verifies that the fault happens in kernel space
+ * (error_code & 4) == 0, and that the fault was not a
+ * protection error (error_code & 9) == 0.
+ */
+#ifdef CONFIG_X86_32
+ if (unlikely(address >= TASK_SIZE)) {
+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+ vmalloc_fault(address) >= 0)
+ return;
+
+ /* Can handle a stale RO->RW TLB */
+ if (spurious_fault(address, error_code))
+ return;
+
+ /*
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock.
+ */
+ goto bad_area_nosemaphore;
+ }
+
+ /* It's safe to allow irq's after cr2 has been saved and the vmalloc
+ fault has been handled. */
+ if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+ local_irq_enable();
+
+ /*
+ * If we're in an interrupt, have no user context or are running in an
+ * atomic region then we must not take the fault.
+ */
+ if (in_atomic() || !mm)
+ goto bad_area_nosemaphore;
+#else /* CONFIG_X86_64 */
+ if (unlikely(address >= TASK_SIZE64)) {
+ /*
+ * Don't check for the module range here: its PML4
+ * is always initialized because it's shared with the main
+ * kernel text. Only vmalloc may need PML4 syncups.
+ */
+ if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+ ((address >= VMALLOC_START && address < VMALLOC_END))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+ }
+
+ /* Can handle a stale RO->RW TLB */
+ if (spurious_fault(address, error_code))
+ return;
+
+ /*
+ * Don't take the mm semaphore here. If we fixup a prefetch
+ * fault we could otherwise deadlock.
+ */
+ goto bad_area_nosemaphore;
+ }
+ if (likely(regs->flags & X86_EFLAGS_IF))
+ local_irq_enable();
+
+ if (unlikely(error_code & PF_RSVD))
+ pgtable_bad(address, regs, error_code);
+
+ /*
+ * If we're in an interrupt, have no user context or are running in an
+ * atomic region then we must not take the fault.
+ */
+ if (unlikely(in_atomic() || !mm))
+ goto bad_area_nosemaphore;
+
+ /*
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
+ */
+ if (user_mode_vm(regs))
+ error_code |= PF_USER;
+again:
+#endif
+ /* When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in the
+ * kernel and should generate an OOPS. Unfortunately, in the case of an
+ * erroneous fault occurring in a code path which already holds mmap_sem
+ * we will deadlock attempting to validate the fault against the
+ * address space. Luckily the kernel only validly references user
+ * space from well defined areas of code, which are listed in the
+ * exceptions table.
+ *
+ * As the vast majority of faults will be valid we will only perform
+ * the source reference check when there is a possibility of a deadlock.
+ * Attempt to lock the address space, if we cannot we then validate the
+ * source. If this is invalid we can skip the address space check,
+ * thus avoiding the deadlock.
+ */
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ if ((error_code & PF_USER) == 0 &&
+ !search_exception_tables(regs->ip))
+ goto bad_area_nosemaphore;
+ down_read(&mm->mmap_sem);
+ }
+
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto bad_area;
+ if (vma->vm_start <= address)
+ goto good_area;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto bad_area;
+ if (error_code & PF_USER) {
+ /*
+ * Accessing the stack below %sp is always a bug.
+ * The large cushion allows instructions like enter
+ * and pusha to work. ("enter $65535,$31" pushes
+ * 32 pointers and then decrements %sp by 65535.)
+ */
+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
+ goto bad_area;
+ }
+ if (expand_stack(vma, address))
+ goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+ si_code = SEGV_ACCERR;
+ write = 0;
+ switch (error_code & (PF_PROT|PF_WRITE)) {
+ default: /* 3: write, present */
+ /* fall through */
+ case PF_WRITE: /* write, not present */
+ if (!(vma->vm_flags & VM_WRITE))
+ goto bad_area;
+ write++;
+ break;
+ case PF_PROT: /* read, present */
+ goto bad_area;
+ case 0: /* read, not present */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ goto bad_area;
+ }
+
+#ifdef CONFIG_X86_32
+survive:
+#endif
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+ fault = handle_mm_fault(mm, vma, address, write);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ if (fault & VM_FAULT_OOM)
+ goto out_of_memory;
+ else if (fault & VM_FAULT_SIGBUS)
+ goto do_sigbus;
+ BUG();
+ }
+ if (fault & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+
+#ifdef CONFIG_X86_32
+ /*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+ if (v8086_mode(regs)) {
+ unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+ if (bit < 32)
+ tsk->thread.screen_bitmap |= 1 << bit;
+ }
+#endif
+ up_read(&mm->mmap_sem);
+ return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+ up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+ /* User mode accesses just cause a SIGSEGV */
+ if (error_code & PF_USER) {
+ /*
+ * It's possible to have interrupts off here.
+ */
+ local_irq_enable();
+
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space.
+ */
+ if (is_prefetch(regs, address, error_code))
+ return;
+
+ if (is_errata100(regs, address))
+ return;
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+ printk(
+#ifdef CONFIG_X86_32
+ "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+#else
+ "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
+#endif
+ task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, task_pid_nr(tsk), address, regs->ip,
+ regs->sp, error_code);
+ print_vma_addr(" in ", regs->ip);
+ printk("\n");
+ }
+
+ tsk->thread.cr2 = address;
+ /* Kernel addresses are always protection faults */
+ tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+ return;
+ }
+
+ if (is_f00f_bug(regs, address))
+ return;
+
+no_context:
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs))
+ return;
+
+ /*
+ * X86_32
+ * Valid to do another page fault here, because if this fault
+ * had been triggered by is_prefetch fixup_exception would have
+ * handled it.
+ *
+ * X86_64
+ * Hall of shame of CPU/BIOS bugs.
+ */
+ if (is_prefetch(regs, address, error_code))
+ return;
+
+ if (is_errata93(regs, address))
+ return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+ bust_spinlocks(1);
+#else
+ flags = oops_begin();
+#endif
+
+ show_fault_oops(regs, error_code, address);
+
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+ die("Oops", regs, error_code);
+ bust_spinlocks(0);
+ do_exit(SIGKILL);
+#else
+ if (__die("Oops", regs, error_code))
+ regs = NULL;
+ /* Executive summary in case the body of the oops scrolled away */
+ printk(KERN_EMERG "CR2: %016lx\n", address);
+ oops_end(flags, regs, SIGKILL);
+#endif
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (is_global_init(tsk)) {
+ yield();
+#ifdef CONFIG_X86_32
+ down_read(&mm->mmap_sem);
+ goto survive;
+#else
+ goto again;
+#endif
+ }
+
+ printk("VM: killing process %s\n", tsk->comm);
+ if (error_code & PF_USER)
+ do_group_exit(SIGKILL);
+ goto no_context;
+
+do_sigbus:
+ up_read(&mm->mmap_sem);
+
+ /* Kernel mode? Handle exceptions or die */
+ if (!(error_code & PF_USER))
+ goto no_context;
+#ifdef CONFIG_X86_32
+ /* User space => ok to do another page fault */
+ if (is_prefetch(regs, address, error_code))
+ return;
+#endif
+ tsk->thread.cr2 = address;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = 14;
+ force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+void vmalloc_sync_all(void)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * Note that races in the updates of insync and start aren't
+ * problematic: insync can only get set bits added, and updates to
+ * start are only improving performance (without affecting correctness
+ * if undone).
+ */
+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+ static unsigned long start = TASK_SIZE;
+ unsigned long address;
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+ for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+ if (!test_bit(pgd_index(address), insync)) {
+ unsigned long flags;
+ struct page *page;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ if (!vmalloc_sync_one(page_address(page),
+ address))
+ break;
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ if (!page)
+ set_bit(pgd_index(address), insync);
+ }
+ if (address == start && test_bit(pgd_index(address), insync))
+ start = address + PGDIR_SIZE;
+ }
+#else /* CONFIG_X86_64 */
+ /*
+ * Note that races in the updates of insync and start aren't
+ * problematic: insync can only get set bits added, and updates to
+ * start are only improving performance (without affecting correctness
+ * if undone).
+ */
+ static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+ static unsigned long start = VMALLOC_START & PGDIR_MASK;
+ unsigned long address;
+
+ for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+ if (!test_bit(pgd_index(address), insync)) {
+ const pgd_t *pgd_ref = pgd_offset_k(address);
+ struct page *page;
+
+ if (pgd_none(*pgd_ref))
+ continue;
+ spin_lock(&pgd_lock);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+ }
+ spin_unlock(&pgd_lock);
+ set_bit(pgd_index(address), insync);
+ }
+ if (address == start)
+ start = address + PGDIR_SIZE;
+ }
+ /* Check that there is no need to do the same for the modules area. */
+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ (__START_KERNEL & PGDIR_MASK)));
+#endif
+}
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
deleted file mode 100644
index a2273d44aa2..00000000000
--- a/arch/x86/mm/fault_32.c
+++ /dev/null
@@ -1,659 +0,0 @@
-/*
- * linux/arch/i386/mm/fault.c
- *
- * Copyright (C) 1995 Linus Torvalds
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h> /* For unblank_screen() */
-#include <linux/highmem.h>
-#include <linux/bootmem.h> /* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-
-extern void die(const char *,struct pt_regs *,long);
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
- int ret = 0;
-
- /* kprobe_running() needs smp_processor_id() */
- if (!user_mode_vm(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 14))
- ret = 1;
- preempt_enable();
- }
-
- return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
- return 0;
-}
-#endif
-
-/*
- * Return EIP plus the CS segment base. The segment limit is also
- * adjusted, clamped to the kernel/user address space (whichever is
- * appropriate), and returned in *eip_limit.
- *
- * The segment is checked, because it might have been changed by another
- * task between the original faulting instruction and here.
- *
- * If CS is no longer a valid code segment, or if EIP is beyond the
- * limit, or if it is a kernel address when CS is not a kernel segment,
- * then the returned value will be greater than *eip_limit.
- *
- * This is slow, but is very rarely executed.
- */
-static inline unsigned long get_segment_eip(struct pt_regs *regs,
- unsigned long *eip_limit)
-{
- unsigned long eip = regs->eip;
- unsigned seg = regs->xcs & 0xffff;
- u32 seg_ar, seg_limit, base, *desc;
-
- /* Unlikely, but must come before segment checks. */
- if (unlikely(regs->eflags & VM_MASK)) {
- base = seg << 4;
- *eip_limit = base + 0xffff;
- return base + (eip & 0xffff);
- }
-
- /* The standard kernel/user address space limit. */
- *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
-
- /* By far the most common cases. */
- if (likely(SEGMENT_IS_FLAT_CODE(seg)))
- return eip;
-
- /* Check the segment exists, is within the current LDT/GDT size,
- that kernel/user (ring 0..3) has the appropriate privilege,
- that it's a code segment, and get the limit. */
- __asm__ ("larl %3,%0; lsll %3,%1"
- : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
- if ((~seg_ar & 0x9800) || eip > seg_limit) {
- *eip_limit = 0;
- return 1; /* So that returned eip > *eip_limit. */
- }
-
- /* Get the GDT/LDT descriptor base.
- When you look for races in this code remember that
- LDT and other horrors are only used in user space. */
- if (seg & (1<<2)) {
- /* Must lock the LDT while reading it. */
- mutex_lock(&current->mm->context.lock);
- desc = current->mm->context.ldt;
- desc = (void *)desc + (seg & ~7);
- } else {
- /* Must disable preemption while reading the GDT. */
- desc = (u32 *)get_cpu_gdt_table(get_cpu());
- desc = (void *)desc + (seg & ~7);
- }
-
- /* Decode the code segment base from the descriptor */
- base = get_desc_base((unsigned long *)desc);
-
- if (seg & (1<<2)) {
- mutex_unlock(&current->mm->context.lock);
- } else
- put_cpu();
-
- /* Adjust EIP and segment limit, and clamp at the kernel limit.
- It's legitimate for segments to wrap at 0xffffffff. */
- seg_limit += base;
- if (seg_limit < *eip_limit && seg_limit >= base)
- *eip_limit = seg_limit;
- return eip + base;
-}
-
-/*
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- */
-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
-{
- unsigned long limit;
- unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
- int scan_more = 1;
- int prefetch = 0;
- int i;
-
- for (i = 0; scan_more && i < 15; i++) {
- unsigned char opcode;
- unsigned char instr_hi;
- unsigned char instr_lo;
-
- if (instr > (unsigned char *)limit)
- break;
- if (probe_kernel_address(instr, opcode))
- break;
-
- instr_hi = opcode & 0xf0;
- instr_lo = opcode & 0x0f;
- instr++;
-
- switch (instr_hi) {
- case 0x20:
- case 0x30:
- /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
- scan_more = ((instr_lo & 7) == 0x6);
- break;
-
- case 0x60:
- /* 0x64 thru 0x67 are valid prefixes in all modes. */
- scan_more = (instr_lo & 0xC) == 0x4;
- break;
- case 0xF0:
- /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
- scan_more = !instr_lo || (instr_lo>>1) == 1;
- break;
- case 0x00:
- /* Prefetch instruction is 0x0F0D or 0x0F18 */
- scan_more = 0;
- if (instr > (unsigned char *)limit)
- break;
- if (probe_kernel_address(instr, opcode))
- break;
- prefetch = (instr_lo == 0xF) &&
- (opcode == 0x0D || opcode == 0x18);
- break;
- default:
- scan_more = 0;
- break;
- }
- }
- return prefetch;
-}
-
-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
-{
- if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 6)) {
- /* Catch an obscure case of prefetch inside an NX page. */
- if (nx_enabled && (error_code & 16))
- return 0;
- return __is_prefetch(regs, addr);
- }
- return 0;
-}
-
-static noinline void force_sig_info_fault(int si_signo, int si_code,
- unsigned long address, struct task_struct *tsk)
-{
- siginfo_t info;
-
- info.si_signo = si_signo;
- info.si_errno = 0;
- info.si_code = si_code;
- info.si_addr = (void __user *)address;
- force_sig_info(si_signo, &info, tsk);
-}
-
-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
-
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
- unsigned index = pgd_index(address);
- pgd_t *pgd_k;
- pud_t *pud, *pud_k;
- pmd_t *pmd, *pmd_k;
-
- pgd += index;
- pgd_k = init_mm.pgd + index;
-
- if (!pgd_present(*pgd_k))
- return NULL;
-
- /*
- * set_pgd(pgd, *pgd_k); here would be useless on PAE
- * and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
- */
-
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
- if (!pud_present(*pud_k))
- return NULL;
-
- pmd = pmd_offset(pud, address);
- pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- return NULL;
- if (!pmd_present(*pmd)) {
- set_pmd(pmd, *pmd_k);
- arch_flush_lazy_mmu_mode();
- } else
- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
- return pmd_k;
-}
-
-/*
- * Handle a fault on the vmalloc or module mapping area
- *
- * This assumes no large pages in there.
- */
-static inline int vmalloc_fault(unsigned long address)
-{
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "current" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- pgd_paddr = read_cr3();
- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
- if (!pmd_k)
- return -1;
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- return -1;
- return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- *
- * error_code:
- * bit 0 == 0 means no page found, 1 means protection fault
- * bit 1 == 0 means read, 1 means write
- * bit 2 == 0 means kernel, 1 means user-mode
- * bit 3 == 1 means use of reserved bit detected
- * bit 4 == 1 means fault was an instruction fetch
- */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
- unsigned long error_code)
-{
- struct task_struct *tsk;
- struct mm_struct *mm;
- struct vm_area_struct * vma;
- unsigned long address;
- int write, si_code;
- int fault;
-
- /*
- * We can fault from pretty much anywhere, with unknown IRQ state.
- */
- trace_hardirqs_fixup();
-
- /* get the address */
- address = read_cr2();
-
- tsk = current;
-
- si_code = SEGV_MAPERR;
-
- /*
- * We fault-in kernel-space virtual memory on-demand. The
- * 'reference' page table is init_mm.pgd.
- *
- * NOTE! We MUST NOT take any locks for this case. We may
- * be in an interrupt or a critical region, and should
- * only copy the information from the master page table,
- * nothing more.
- *
- * This verifies that the fault happens in kernel space
- * (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 9) == 0.
- */
- if (unlikely(address >= TASK_SIZE)) {
- if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
- return;
- if (notify_page_fault(regs))
- return;
- /*
- * Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock.
- */
- goto bad_area_nosemaphore;
- }
-
- if (notify_page_fault(regs))
- return;
-
- /* It's safe to allow irq's after cr2 has been saved and the vmalloc
- fault has been handled. */
- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
- local_irq_enable();
-
- mm = tsk->mm;
-
- /*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault..
- */
- if (in_atomic() || !mm)
- goto bad_area_nosemaphore;
-
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
- */
- if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & 4) == 0 &&
- !search_exception_tables(regs->eip))
- goto bad_area_nosemaphore;
- down_read(&mm->mmap_sem);
- }
-
- vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (vma->vm_start <= address)
- goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
- if (error_code & 4) {
- /*
- * Accessing the stack below %esp is always a bug.
- * The large cushion allows instructions like enter
- * and pusha to work. ("enter $65535,$31" pushes
- * 32 pointers and then decrements %esp by 65535.)
- */
- if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
- goto bad_area;
- }
- if (expand_stack(vma, address))
- goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
- si_code = SEGV_ACCERR;
- write = 0;
- switch (error_code & 3) {
- default: /* 3: write, present */
- /* fall through */
- case 2: /* write, not present */
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- write++;
- break;
- case 1: /* read, present */
- goto bad_area;
- case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
- }
-
- survive:
- /*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault.
- */
- fault = handle_mm_fault(mm, vma, address, write);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
- }
- if (fault & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
-
- /*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
- if (regs->eflags & VM_MASK) {
- unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.screen_bitmap |= 1 << bit;
- }
- up_read(&mm->mmap_sem);
- return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (error_code & 4) {
- /*
- * It's possible to have interrupts off here.
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk("%s%s[%d]: segfault at %08lx eip %08lx "
- "esp %08lx error %lx\n",
- task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, task_pid_nr(tsk), address, regs->eip,
- regs->esp, error_code);
- }
- tsk->thread.cr2 = address;
- /* Kernel addresses are always protection faults */
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk);
- return;
- }
-
-#ifdef CONFIG_X86_F00F_BUG
- /*
- * Pentium F0 0F C7 C8 bug workaround.
- */
- if (boot_cpu_data.f00f_bug) {
- unsigned long nr;
-
- nr = (address - idt_descr.address) >> 3;
-
- if (nr == 6) {
- do_invalid_op(regs, 0);
- return;
- }
- }
-#endif
-
-no_context:
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
- return;
-
- /*
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- */
- if (is_prefetch(regs, address, error_code))
- return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
- bust_spinlocks(1);
-
- if (oops_may_print()) {
- __typeof__(pte_val(__pte(0))) page;
-
-#ifdef CONFIG_X86_PAE
- if (error_code & 16) {
- pte_t *pte = lookup_address(address);
-
- if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
- printk(KERN_CRIT "kernel tried to execute "
- "NX-protected page - exploit attempt? "
- "(uid: %d)\n", current->uid);
- }
-#endif
- if (address < PAGE_SIZE)
- printk(KERN_ALERT "BUG: unable to handle kernel NULL "
- "pointer dereference");
- else
- printk(KERN_ALERT "BUG: unable to handle kernel paging"
- " request");
- printk(" at virtual address %08lx\n",address);
- printk(KERN_ALERT "printing eip: %08lx ", regs->eip);
-
- page = read_cr3();
- page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
-#ifdef CONFIG_X86_PAE
- printk("*pdpt = %016Lx ", page);
- if ((page >> PAGE_SHIFT) < max_low_pfn
- && page & _PAGE_PRESENT) {
- page &= PAGE_MASK;
- page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
- & (PTRS_PER_PMD - 1)];
- printk(KERN_CONT "*pde = %016Lx ", page);
- page &= ~_PAGE_NX;
- }
-#else
- printk("*pde = %08lx ", page);
-#endif
-
- /*
- * We must not directly access the pte in the highpte
- * case if the page table is located in highmem.
- * And let's rather not kmap-atomic the pte, just in case
- * it's allocated already.
- */
- if ((page >> PAGE_SHIFT) < max_low_pfn
- && (page & _PAGE_PRESENT)
- && !(page & _PAGE_PSE)) {
- page &= PAGE_MASK;
- page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
- & (PTRS_PER_PTE - 1)];
- printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
- }
-
- printk("\n");
- }
-
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
- die("Oops", regs, error_code);
- bust_spinlocks(0);
- do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (is_global_init(tsk)) {
- yield();
- down_read(&mm->mmap_sem);
- goto survive;
- }
- printk("VM: killing process %s\n", tsk->comm);
- if (error_code & 4)
- do_group_exit(SIGKILL);
- goto no_context;
-
-do_sigbus:
- up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & 4))
- goto no_context;
-
- /* User space => ok to do another page fault */
- if (is_prefetch(regs, address, error_code))
- return;
-
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-void vmalloc_sync_all(void)
-{
- /*
- * Note that races in the updates of insync and start aren't
- * problematic: insync can only get set bits added, and updates to
- * start are only improving performance (without affecting correctness
- * if undone).
- */
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = TASK_SIZE;
- unsigned long address;
-
- if (SHARED_KERNEL_PMD)
- return;
-
- BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
- for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- unsigned long flags;
- struct page *page;
-
- spin_lock_irqsave(&pgd_lock, flags);
- for (page = pgd_list; page; page =
- (struct page *)page->index)
- if (!vmalloc_sync_one(page_address(page),
- address)) {
- BUG_ON(page != pgd_list);
- break;
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
- if (!page)
- set_bit(pgd_index(address), insync);
- }
- if (address == start && test_bit(pgd_index(address), insync))
- start = address + PGDIR_SIZE;
- }
-}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
deleted file mode 100644
index 0e26230669c..00000000000
--- a/arch/x86/mm/fault_64.c
+++ /dev/null
@@ -1,623 +0,0 @@
-/*
- * linux/arch/x86-64/mm/fault.c
- *
- * Copyright (C) 1995 Linus Torvalds
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h> /* For unblank_screen() */
-#include <linux/compiler.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm-generic/sections.h>
-
-/* Page fault error code bits */
-#define PF_PROT (1<<0) /* or no page found */
-#define PF_WRITE (1<<1)
-#define PF_USER (1<<2)
-#define PF_RSVD (1<<3)
-#define PF_INSTR (1<<4)
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
- int ret = 0;
-
- /* kprobe_running() needs smp_processor_id() */
- if (!user_mode(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 14))
- ret = 1;
- preempt_enable();
- }
-
- return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
- return 0;
-}
-#endif
-
-/* Sometimes the CPU reports invalid exceptions on prefetch.
- Check that here and ignore.
- Opcode checker based on code by Richard Brunner */
-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
-{
- unsigned char *instr;
- int scan_more = 1;
- int prefetch = 0;
- unsigned char *max_instr;
-
- /* If it was a exec fault ignore */
- if (error_code & PF_INSTR)
- return 0;
-
- instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
- max_instr = instr + 15;
-
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
- return 0;
-
- while (scan_more && instr < max_instr) {
- unsigned char opcode;
- unsigned char instr_hi;
- unsigned char instr_lo;
-
- if (probe_kernel_address(instr, opcode))
- break;
-
- instr_hi = opcode & 0xf0;
- instr_lo = opcode & 0x0f;
- instr++;
-
- switch (instr_hi) {
- case 0x20:
- case 0x30:
- /* Values 0x26,0x2E,0x36,0x3E are valid x86
- prefixes. In long mode, the CPU will signal
- invalid opcode if some of these prefixes are
- present so we will never get here anyway */
- scan_more = ((instr_lo & 7) == 0x6);
- break;
-
- case 0x40:
- /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
- Need to figure out under what instruction mode the
- instruction was issued ... */
- /* Could check the LDT for lm, but for now it's good
- enough to assume that long mode only uses well known
- segments or kernel. */
- scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
- break;
-
- case 0x60:
- /* 0x64 thru 0x67 are valid prefixes in all modes. */
- scan_more = (instr_lo & 0xC) == 0x4;
- break;
- case 0xF0:
- /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
- scan_more = !instr_lo || (instr_lo>>1) == 1;
- break;
- case 0x00:
- /* Prefetch instruction is 0x0F0D or 0x0F18 */
- scan_more = 0;
- if (probe_kernel_address(instr, opcode))
- break;
- prefetch = (instr_lo == 0xF) &&
- (opcode == 0x0D || opcode == 0x18);
- break;
- default:
- scan_more = 0;
- break;
- }
- }
- return prefetch;
-}
-
-static int bad_address(void *p)
-{
- unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
-}
-
-void dump_pagetable(unsigned long address)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = (pgd_t *)read_cr3();
-
- pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
- pgd += pgd_index(address);
- if (bad_address(pgd)) goto bad;
- printk("PGD %lx ", pgd_val(*pgd));
- if (!pgd_present(*pgd)) goto ret;
-
- pud = pud_offset(pgd, address);
- if (bad_address(pud)) goto bad;
- printk("PUD %lx ", pud_val(*pud));
- if (!pud_present(*pud)) goto ret;
-
- pmd = pmd_offset(pud, address);
- if (bad_address(pmd)) goto bad;
- printk("PMD %lx ", pmd_val(*pmd));
- if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
-
- pte = pte_offset_kernel(pmd, address);
- if (bad_address(pte)) goto bad;
- printk("PTE %lx", pte_val(*pte));
-ret:
- printk("\n");
- return;
-bad:
- printk("BAD\n");
-}
-
-static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-
-/* Workaround for K8 erratum #93 & buggy BIOS.
- BIOS SMM functions are required to use a specific workaround
- to avoid corruption of the 64bit RIP register on C stepping K8.
- A lot of BIOS that didn't get tested properly miss this.
- The OS sees this as a page fault with the upper 32bits of RIP cleared.
- Try to work around it here.
- Note we only handle faults in kernel here. */
-
-static int is_errata93(struct pt_regs *regs, unsigned long address)
-{
- static int warned;
- if (address != regs->rip)
- return 0;
- if ((address >> 32) != 0)
- return 0;
- address |= 0xffffffffUL << 32;
- if ((address >= (u64)_stext && address <= (u64)_etext) ||
- (address >= MODULES_VADDR && address <= MODULES_END)) {
- if (!warned) {
- printk(errata93_warning);
- warned = 1;
- }
- regs->rip = address;
- return 1;
- }
- return 0;
-}
-
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
- unsigned long error_code)
-{
- unsigned long flags = oops_begin();
- struct task_struct *tsk;
-
- printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
- current->comm, address);
- dump_pagetable(address);
- tsk = current;
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
- __die("Bad pagetable", regs, error_code);
- oops_end(flags);
- do_exit(SIGKILL);
-}
-
-/*
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
- pgd_t *pgd, *pgd_ref;
- pud_t *pud, *pud_ref;
- pmd_t *pmd, *pmd_ref;
- pte_t *pte, *pte_ref;
-
- /* Copy kernel mappings over when needed. This can also
- happen within a race in page table update. In the later
- case just flush. */
-
- pgd = pgd_offset(current->mm ?: &init_mm, address);
- pgd_ref = pgd_offset_k(address);
- if (pgd_none(*pgd_ref))
- return -1;
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-
- /* Below here mismatches are bugs because these lower tables
- are shared */
-
- pud = pud_offset(pgd, address);
- pud_ref = pud_offset(pgd_ref, address);
- if (pud_none(*pud_ref))
- return -1;
- if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
- BUG();
- pmd = pmd_offset(pud, address);
- pmd_ref = pmd_offset(pud_ref, address);
- if (pmd_none(*pmd_ref))
- return -1;
- if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
- BUG();
- pte_ref = pte_offset_kernel(pmd_ref, address);
- if (!pte_present(*pte_ref))
- return -1;
- pte = pte_offset_kernel(pmd, address);
- /* Don't use pte_page here, because the mappings can point
- outside mem_map, and the NUMA hash lookup cannot handle
- that. */
- if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
- BUG();
- return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
- unsigned long error_code)
-{
- struct task_struct *tsk;
- struct mm_struct *mm;
- struct vm_area_struct * vma;
- unsigned long address;
- const struct exception_table_entry *fixup;
- int write, fault;
- unsigned long flags;
- siginfo_t info;
-
- /*
- * We can fault from pretty much anywhere, with unknown IRQ state.
- */
- trace_hardirqs_fixup();
-
- tsk = current;
- mm = tsk->mm;
- prefetchw(&mm->mmap_sem);
-
- /* get the address */
- address = read_cr2();
-
- info.si_code = SEGV_MAPERR;
-
-
- /*
- * We fault-in kernel-space virtual memory on-demand. The
- * 'reference' page table is init_mm.pgd.
- *
- * NOTE! We MUST NOT take any locks for this case. We may
- * be in an interrupt or a critical region, and should
- * only copy the information from the master page table,
- * nothing more.
- *
- * This verifies that the fault happens in kernel space
- * (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 9) == 0.
- */
- if (unlikely(address >= TASK_SIZE64)) {
- /*
- * Don't check for the module range here: its PML4
- * is always initialized because it's shared with the main
- * kernel text. Only vmalloc may need PML4 syncups.
- */
- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
- ((address >= VMALLOC_START && address < VMALLOC_END))) {
- if (vmalloc_fault(address) >= 0)
- return;
- }
- if (notify_page_fault(regs))
- return;
- /*
- * Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock.
- */
- goto bad_area_nosemaphore;
- }
-
- if (notify_page_fault(regs))
- return;
-
- if (likely(regs->eflags & X86_EFLAGS_IF))
- local_irq_enable();
-
- if (unlikely(error_code & PF_RSVD))
- pgtable_bad(address, regs, error_code);
-
- /*
- * If we're in an interrupt or have no user
- * context, we must not take the fault..
- */
- if (unlikely(in_atomic() || !mm))
- goto bad_area_nosemaphore;
-
- /*
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
- */
- if (user_mode_vm(regs))
- error_code |= PF_USER;
-
- again:
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
- * Attempt to lock the address space, if we cannot we then validate the
- * source. If this is invalid we can skip the address space check,
- * thus avoiding the deadlock.
- */
- if (!down_read_trylock(&mm->mmap_sem)) {
- if ((error_code & PF_USER) == 0 &&
- !search_exception_tables(regs->rip))
- goto bad_area_nosemaphore;
- down_read(&mm->mmap_sem);
- }
-
- vma = find_vma(mm, address);
- if (!vma)
- goto bad_area;
- if (likely(vma->vm_start <= address))
- goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto bad_area;
- if (error_code & 4) {
- /* Allow userspace just enough access below the stack pointer
- * to let the 'enter' instruction work.
- */
- if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
- goto bad_area;
- }
- if (expand_stack(vma, address))
- goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
- info.si_code = SEGV_ACCERR;
- write = 0;
- switch (error_code & (PF_PROT|PF_WRITE)) {
- default: /* 3: write, present */
- /* fall through */
- case PF_WRITE: /* write, not present */
- if (!(vma->vm_flags & VM_WRITE))
- goto bad_area;
- write++;
- break;
- case PF_PROT: /* read, present */
- goto bad_area;
- case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
- goto bad_area;
- }
-
- /*
- * If for any reason at all we couldn't handle the fault,
- * make sure we exit gracefully rather than endlessly redo
- * the fault.
- */
- fault = handle_mm_fault(mm, vma, address, write);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- if (fault & VM_FAULT_OOM)
- goto out_of_memory;
- else if (fault & VM_FAULT_SIGBUS)
- goto do_sigbus;
- BUG();
- }
- if (fault & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- up_read(&mm->mmap_sem);
- return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
- up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
- /* User mode accesses just cause a SIGSEGV */
- if (error_code & PF_USER) {
-
- /*
- * It's possible to have interrupts off here.
- */
- local_irq_enable();
-
- if (is_prefetch(regs, address, error_code))
- return;
-
- /* Work around K8 erratum #100 K8 in compat mode
- occasionally jumps to illegal addresses >4GB. We
- catch this here in the page fault handler because
- these addresses are not reachable. Just detect this
- case and return. Any code segment in LDT is
- compatibility mode. */
- if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
- (address >> 32))
- return;
-
- if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
- printk_ratelimit()) {
- printk(
- "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
- tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
- tsk->comm, tsk->pid, address, regs->rip,
- regs->rsp, error_code);
- }
-
- tsk->thread.cr2 = address;
- /* Kernel addresses are always protection faults */
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
- tsk->thread.trap_no = 14;
- info.si_signo = SIGSEGV;
- info.si_errno = 0;
- /* info.si_code has been set above */
- info.si_addr = (void __user *)address;
- force_sig_info(SIGSEGV, &info, tsk);
- return;
- }
-
-no_context:
-
- /* Are we prepared to handle this kernel fault? */
- fixup = search_exception_tables(regs->rip);
- if (fixup) {
- regs->rip = fixup->fixup;
- return;
- }
-
- /*
- * Hall of shame of CPU/BIOS bugs.
- */
-
- if (is_prefetch(regs, address, error_code))
- return;
-
- if (is_errata93(regs, address))
- return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
- flags = oops_begin();
-
- if (address < PAGE_SIZE)
- printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
- else
- printk(KERN_ALERT "Unable to handle kernel paging request");
- printk(" at %016lx RIP: \n" KERN_ALERT,address);
- printk_address(regs->rip);
- dump_pagetable(address);
- tsk->thread.cr2 = address;
- tsk->thread.trap_no = 14;
- tsk->thread.error_code = error_code;
- __die("Oops", regs, error_code);
- /* Executive summary in case the body of the oops scrolled away */
- printk(KERN_EMERG "CR2: %016lx\n", address);
- oops_end(flags);
- do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (is_global_init(current)) {
- yield();
- goto again;
- }
- printk("VM: killing process %s\n", tsk->comm);
- if (error_code & 4)
- do_group_exit(SIGKILL);
- goto no_context;
-
-do_sigbus:
- up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & PF_USER))
- goto no_context;
-
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, tsk);
- return;
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
- /* Note that races in the updates of insync and start aren't
- problematic:
- insync can only get set bits added, and updates to start are only
- improving performance (without affecting correctness if undone). */
- static DECLARE_BITMAP(insync, PTRS_PER_PGD);
- static unsigned long start = VMALLOC_START & PGDIR_MASK;
- unsigned long address;
-
- for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
- if (!test_bit(pgd_index(address), insync)) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
- struct page *page;
-
- if (pgd_none(*pgd_ref))
- continue;
- spin_lock(&pgd_lock);
- list_for_each_entry(page, &pgd_list, lru) {
- pgd_t *pgd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
- }
- spin_unlock(&pgd_lock);
- set_bit(pgd_index(address), insync);
- }
- if (address == start)
- start = address + PGDIR_SIZE;
- }
- /* Check that there is no need to do the same for the modules area. */
- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
- (__START_KERNEL & PGDIR_MASK)));
-}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1c3bf95f735..3d936f23270 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -18,6 +18,49 @@ void kunmap(struct page *page)
kunmap_high(page);
}
+static void debug_kmap_atomic_prot(enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+ static unsigned warn_count = 10;
+
+ if (unlikely(warn_count == 0))
+ return;
+
+ if (unlikely(in_interrupt())) {
+ if (in_irq()) {
+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
+ type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+ type != KM_BOUNCE_READ) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (!irqs_disabled()) { /* softirq */
+ if (type != KM_IRQ0 && type != KM_IRQ1 &&
+ type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+ type != KM_SKB_SUNRPC_DATA &&
+ type != KM_SKB_DATA_SOFTIRQ &&
+ type != KM_BOUNCE_READ) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ }
+ }
+
+ if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+ type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+ if (!irqs_disabled()) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+ if (irq_count() == 0 && !irqs_disabled()) {
+ WARN_ON(1);
+ warn_count--;
+ }
+ }
+#endif
+}
+
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
@@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
-
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+
+ debug_kmap_atomic_prot(type);
+
pagefault_disable();
if (!PageHighMem(page))
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6c06d9c0488..4fbafb4bc2f 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -15,6 +15,7 @@
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
@@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_lock(&mm->page_table_lock);
if (pud_none(*pud))
- pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
+ pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
else
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3c76d194fd2..da524fb2242 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -27,7 +27,6 @@
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
-#include <linux/efi.h>
#include <linux/memory_hotplug.h>
#include <linux/initrd.h>
#include <linux/cpumask.h>
@@ -40,8 +39,10 @@
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
+#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
@@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
-static int noinline do_test_wp_bit(void);
+static noinline int do_test_wp_bit(void);
/*
* Creates a middle page table and puts a pointer to it in the
@@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
pud_t *pud;
pmd_t *pmd_table;
-
+
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+ paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
- if (pmd_table != pmd_offset(pud, 0))
- BUG();
+ BUG_ON(pmd_table != pmd_offset(pud, 0));
}
#endif
pud = pud_offset(pgd, 0);
pmd_table = pmd_offset(pud, 0);
+
return pmd_table;
}
/*
* Create a page table and place a pointer to it in a middle page
- * directory entry.
+ * directory entry:
*/
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
@@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
#ifdef CONFIG_DEBUG_PAGEALLOC
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
- if (!page_table)
+ if (!page_table) {
page_table =
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ }
paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
}
/*
- * This function initializes a certain range of kernel virtual memory
+ * This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
* the given range.
- */
-
-/*
- * NOTE: The pagetables are allocated contiguous on the physical space
- * so we can cache the place of the first one and move around without
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
* checking the pgd every time.
*/
-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
- pgd_t *pgd;
- pmd_t *pmd;
int pgd_idx, pmd_idx;
unsigned long vaddr;
+ pgd_t *pgd;
+ pmd_t *pmd;
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
pmd = pmd + pmd_index(vaddr);
- for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
+ for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+ pmd++, pmd_idx++) {
one_page_table_init(pmd);
vaddr += PMD_SIZE;
@@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr)
}
/*
- * This maps the physical memory to kernel virtual address space, a total
- * of max_low_pfn pages, by creating page tables starting from address
- * PAGE_OFFSET.
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
*/
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
+ int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
- int pgd_idx, pmd_idx, pte_ofs;
pgd_idx = pgd_index(PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
@@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
pmd = one_md_table_init(pgd);
if (pfn >= max_low_pfn)
continue;
- for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
- unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
- /* Map with big pages if possible, otherwise create normal page tables. */
+ for (pmd_idx = 0;
+ pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+ pmd++, pmd_idx++) {
+ unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+ /*
+ * Map with big pages if possible, otherwise
+ * create normal page tables:
+ */
if (cpu_has_pse) {
- unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
- if (is_kernel_text(address) || is_kernel_text(address2))
- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
- else
- set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+ unsigned int addr2;
+ pgprot_t prot = PAGE_KERNEL_LARGE;
+
+ addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+ PAGE_OFFSET + PAGE_SIZE-1;
+
+ if (is_kernel_text(addr) ||
+ is_kernel_text(addr2))
+ prot = PAGE_KERNEL_LARGE_EXEC;
+
+ set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
- } else {
- pte = one_page_table_init(pmd);
-
- for (pte_ofs = 0;
- pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
- pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
- if (is_kernel_text(address))
- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
- else
- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
- }
+ continue;
+ }
+ pte = one_page_table_init(pmd);
+
+ for (pte_ofs = 0;
+ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+ pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+ pgprot_t prot = PAGE_KERNEL;
+
+ if (is_kernel_text(addr))
+ prot = PAGE_KERNEL_EXEC;
+
+ set_pte(pte, pfn_pte(pfn, prot));
}
}
}
@@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr)
return 0;
}
-int page_is_ram(unsigned long pagenr)
-{
- int i;
- unsigned long addr, end;
-
- if (efi_enabled) {
- efi_memory_desc_t *md;
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- md = p;
- if (!is_available_memory(md))
- continue;
- addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
- end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
-
- if ((pagenr >= addr) && (pagenr < end))
- return 1;
- }
- return 0;
- }
-
- for (i = 0; i < e820.nr_map; i++) {
-
- if (e820.map[i].type != E820_RAM) /* not usable memory */
- continue;
- /*
- * !!!FIXME!!! Some BIOSen report areas as RAM that
- * are not. Notably the 640->1Mb area. We need a sanity
- * check here.
- */
- addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
- end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
- if ((pagenr >= addr) && (pagenr < end))
- return 1;
- }
- return 0;
-}
-
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
-#define kmap_get_fixmap_pte(vaddr) \
- pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+ return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+ vaddr), vaddr), vaddr);
+}
static void __init kmap_init(void)
{
unsigned long kmap_vstart;
- /* cache the first kmap pte */
+ /*
+ * Cache the first kmap pte:
+ */
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
@@ -259,11 +241,11 @@ static void __init kmap_init(void)
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
+ unsigned long vaddr;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- unsigned long vaddr;
vaddr = PKMAP_BASE;
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
@@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
- pkmap_page_table = pte;
+ pkmap_page_table = pte;
}
static void __meminit free_new_highpage(struct page *page)
@@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
SetPageReserved(page);
}
-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+static int __meminit
+add_one_highpage_hotplug(struct page *page, unsigned long pfn)
{
free_new_highpage(page);
totalram_pages++;
@@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
max_mapnr = max(pfn, max_mapnr);
#endif
num_physpages++;
+
return 0;
}
@@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
* Not currently handling the NUMA case.
* Assuming single node and all memory that
* has been added dynamically that would be
- * onlined here is in HIGHMEM
+ * onlined here is in HIGHMEM.
*/
void __meminit online_page(struct page *page)
{
@@ -314,13 +298,11 @@ void __meminit online_page(struct page *page)
add_one_highpage_hotplug(page, page_to_pfn(page));
}
-
-#ifdef CONFIG_NUMA
-extern void set_highmem_pages_init(int);
-#else
+#ifndef CONFIG_NUMA
static void __init set_highmem_pages_init(int bad_ppro)
{
int pfn;
+
for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
/*
* Holes under sparsemem might not have no mem_map[]:
@@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro)
}
totalram_pages += totalhigh_pages;
}
-#endif /* CONFIG_FLATMEM */
+#endif /* !CONFIG_NUMA */
#else
-#define kmap_init() do { } while (0)
-#define permanent_kmaps_init(pgd_base) do { } while (0)
-#define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define kmap_init() do { } while (0)
+# define permanent_kmaps_init(pgd_base) do { } while (0)
+# define set_highmem_pages_init(bad_ppro) do { } while (0)
#endif /* CONFIG_HIGHMEM */
-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
EXPORT_SYMBOL(__PAGE_KERNEL);
-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
-#ifdef CONFIG_NUMA
-extern void __init remap_numa_kva(void);
-#else
-#define remap_numa_kva() do {} while (0)
-#endif
+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
void __init native_pagetable_setup_start(pgd_t *base)
{
@@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
memset(&base[USER_PTRS_PER_PGD], 0,
KERNEL_PGD_PTRS * sizeof(pgd_t));
#else
- paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
+ paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
#endif
}
@@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
-static void __init pagetable_init (void)
+static void __init pagetable_init(void)
{
- unsigned long vaddr, end;
pgd_t *pgd_base = swapper_pg_dir;
+ unsigned long vaddr, end;
paravirt_pagetable_setup_start(pgd_base);
@@ -435,9 +412,11 @@ static void __init pagetable_init (void)
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
+ early_ioremap_clear();
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
+ early_ioremap_reset();
permanent_kmaps_init(pgd_base);
@@ -450,7 +429,7 @@ static void __init pagetable_init (void)
* driver might have split up a kernel 4MB mapping.
*/
char __nosavedata swsusp_pg_dir[PAGE_SIZE]
- __attribute__ ((aligned (PAGE_SIZE)));
+ __attribute__ ((aligned(PAGE_SIZE)));
static inline void save_pg_dir(void)
{
@@ -462,7 +441,7 @@ static inline void save_pg_dir(void)
}
#endif
-void zap_low_mappings (void)
+void zap_low_mappings(void)
{
int i;
@@ -474,22 +453,24 @@ void zap_low_mappings (void)
* Note that "pgd_clear()" doesn't do it for
* us, because pgd_clear() is a no-op on i386.
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++)
+ for (i = 0; i < USER_PTRS_PER_PGD; i++) {
#ifdef CONFIG_X86_PAE
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
set_pgd(swapper_pg_dir+i, __pgd(0));
#endif
+ }
flush_tlb_all();
}
-int nx_enabled = 0;
+int nx_enabled;
+
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
#ifdef CONFIG_X86_PAE
-static int disable_nx __initdata = 0;
-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
+static int disable_nx __initdata;
/*
* noexec = on|off
@@ -506,11 +487,14 @@ static int __init noexec_setup(char *str)
__supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
}
- } else if (!strcmp(str,"off")) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- } else
- return -EINVAL;
+ } else {
+ if (!strcmp(str, "off")) {
+ disable_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ } else {
+ return -EINVAL;
+ }
+ }
return 0;
}
@@ -522,6 +506,7 @@ static void __init set_nx(void)
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
if ((v[3] & (1 << 20)) && !disable_nx) {
rdmsr(MSR_EFER, l, h);
l |= EFER_NX;
@@ -531,35 +516,6 @@ static void __init set_nx(void)
}
}
}
-
-/*
- * Enables/disables executability of a given kernel page and
- * returns the previous setting.
- */
-int __init set_kernel_exec(unsigned long vaddr, int enable)
-{
- pte_t *pte;
- int ret = 1;
-
- if (!nx_enabled)
- goto out;
-
- pte = lookup_address(vaddr);
- BUG_ON(!pte);
-
- if (!pte_exec_kernel(*pte))
- ret = 0;
-
- if (enable)
- pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
- else
- pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
- pte_update_defer(&init_mm, vaddr, pte);
- __flush_tlb_all();
-out:
- return ret;
-}
-
#endif
/*
@@ -574,9 +530,8 @@ void __init paging_init(void)
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
- printk("NX (Execute Disable) protection: active\n");
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
#endif
-
pagetable_init();
load_cr3(swapper_pg_dir);
@@ -600,10 +555,10 @@ void __init paging_init(void)
* used to involve black magic jumps to work around some nasty CPU bugs,
* but fortunately the switch to using exceptions got rid of all that.
*/
-
static void __init test_wp_bit(void)
{
- printk("Checking if this processor honours the WP bit even in supervisor mode... ");
+ printk(KERN_INFO
+ "Checking if this processor honours the WP bit even in supervisor mode...");
/* Any page-aligned address will do, the test is non-destructive */
__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
@@ -611,47 +566,46 @@ static void __init test_wp_bit(void)
clear_fixmap(FIX_WP_TEST);
if (!boot_cpu_data.wp_works_ok) {
- printk("No.\n");
+ printk(KERN_CONT "No.\n");
#ifdef CONFIG_X86_WP_WORKS_OK
- panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+ panic(
+ "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
#endif
} else {
- printk("Ok.\n");
+ printk(KERN_CONT "Ok.\n");
}
}
-static struct kcore_list kcore_mem, kcore_vmalloc;
+static struct kcore_list kcore_mem, kcore_vmalloc;
void __init mem_init(void)
{
- extern int ppro_with_ram_bug(void);
int codesize, reservedpages, datasize, initsize;
- int tmp;
- int bad_ppro;
+ int tmp, bad_ppro;
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
-
bad_ppro = ppro_with_ram_bug();
#ifdef CONFIG_HIGHMEM
/* check that fixmap and pkmap do not overlap */
- if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
- printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
+ if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+ printk(KERN_ERR
+ "fixmap and kmap areas overlap - this will crash\n");
printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
+ PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
+ FIXADDR_START);
BUG();
}
#endif
-
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
/*
- * Only count reserved RAM pages
+ * Only count reserved RAM pages:
*/
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
@@ -662,11 +616,12 @@ void __init mem_init(void)
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
VMALLOC_END-VMALLOC_START);
- printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
+ printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+ "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
@@ -677,45 +632,46 @@ void __init mem_init(void)
);
#if 1 /* double-sanity-check paranoia */
- printk("virtual kernel memory layout:\n"
- " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ printk(KERN_INFO "virtual kernel memory layout:\n"
+ " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
- " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
- " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
- " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
- " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
- FIXADDR_START, FIXADDR_TOP,
- (FIXADDR_TOP - FIXADDR_START) >> 10,
+ " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
+ " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
+ " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
+ FIXADDR_START, FIXADDR_TOP,
+ (FIXADDR_TOP - FIXADDR_START) >> 10,
#ifdef CONFIG_HIGHMEM
- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
- (LAST_PKMAP*PAGE_SIZE) >> 10,
+ PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+ (LAST_PKMAP*PAGE_SIZE) >> 10,
#endif
- VMALLOC_START, VMALLOC_END,
- (VMALLOC_END - VMALLOC_START) >> 20,
+ VMALLOC_START, VMALLOC_END,
+ (VMALLOC_END - VMALLOC_START) >> 20,
- (unsigned long)__va(0), (unsigned long)high_memory,
- ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+ (unsigned long)__va(0), (unsigned long)high_memory,
+ ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
- (unsigned long)&__init_begin, (unsigned long)&__init_end,
- ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+ (unsigned long)&__init_begin, (unsigned long)&__init_end,
+ ((unsigned long)&__init_end -
+ (unsigned long)&__init_begin) >> 10,
- (unsigned long)&_etext, (unsigned long)&_edata,
- ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+ (unsigned long)&_etext, (unsigned long)&_edata,
+ ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
- (unsigned long)&_text, (unsigned long)&_etext,
- ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+ (unsigned long)&_text, (unsigned long)&_etext,
+ ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
#ifdef CONFIG_HIGHMEM
- BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
- BUG_ON(VMALLOC_END > PKMAP_BASE);
+ BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+ BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
- BUG_ON(VMALLOC_START > VMALLOC_END);
- BUG_ON((unsigned long)high_memory > VMALLOC_START);
+ BUG_ON(VMALLOC_START > VMALLOC_END);
+ BUG_ON((unsigned long)high_memory > VMALLOC_START);
#endif /* double-sanity-check paranoia */
#ifdef CONFIG_X86_PAE
@@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size)
return __add_pages(zone, start_pfn, nr_pages);
}
-
#endif
-struct kmem_cache *pmd_cache;
-
-void __init pgtable_cache_init(void)
-{
- if (PTRS_PER_PMD > 1)
- pmd_cache = kmem_cache_create("pmd",
- PTRS_PER_PMD*sizeof(pmd_t),
- PTRS_PER_PMD*sizeof(pmd_t),
- SLAB_PANIC,
- pmd_ctor);
-}
-
/*
* This function cannot be __init, since exceptions don't work in that
* section. Put this after the callers, so that it cannot be inlined.
*/
-static int noinline do_test_wp_bit(void)
+static noinline int do_test_wp_bit(void)
{
char tmp_reg;
int flag;
__asm__ __volatile__(
- " movb %0,%1 \n"
- "1: movb %1,%0 \n"
- " xorl %2,%2 \n"
+ " movb %0, %1 \n"
+ "1: movb %1, %0 \n"
+ " xorl %2, %2 \n"
"2: \n"
- ".section __ex_table,\"a\"\n"
+ ".section __ex_table, \"a\"\n"
" .align 4 \n"
- " .long 1b,2b \n"
+ " .long 1b, 2b \n"
".previous \n"
:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
"=q" (tmp_reg),
"=r" (flag)
:"2" (1)
:"memory");
-
+
return flag;
}
#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
@@ -801,32 +746,58 @@ void mark_rodata_ro(void)
if (num_possible_cpus() <= 1)
#endif
{
- change_page_attr(virt_to_page(start),
- size >> PAGE_SHIFT, PAGE_KERNEL_RX);
- printk("Write protecting the kernel text: %luk\n", size >> 10);
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
}
#endif
start += size;
size = (unsigned long)__end_rodata - start;
- change_page_attr(virt_to_page(start),
- size >> PAGE_SHIFT, PAGE_KERNEL_RO);
- printk("Write protecting the kernel read-only data: %luk\n",
- size >> 10);
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+ size >> 10);
+ rodata_test();
- /*
- * change_page_attr() requires a global_flush_tlb() call after it.
- * We do this after the printk so that if something went wrong in the
- * change, the printk gets out at least to give a better debug hint
- * of who is the culprit.
- */
- global_flush_tlb();
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+ set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
}
#endif
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
unsigned long addr;
+ /*
+ * We just marked the kernel text read only above, now that
+ * we are going to free part of that, we need to make that
+ * writeable first.
+ */
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
@@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
totalram_pages++;
}
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+#endif
}
void free_initmem(void)
@@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end)
free_init_pages("initrd memory", start, end);
}
#endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0f9c8c89065..cc50a13ce8d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -43,12 +43,10 @@
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
-#ifndef Dprintk
-#define Dprintk(x...)
-#endif
-
-const struct dma_mapping_ops* dma_ops;
+const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);
static unsigned long dma_reserve __initdata;
@@ -65,22 +63,26 @@ void show_mem(void)
{
long i, total = 0, reserved = 0;
long shared = 0, cached = 0;
- pg_data_t *pgdat;
struct page *page;
+ pg_data_t *pgdat;
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+ printk(KERN_INFO "Free swap: %6ldkB\n",
+ nr_swap_pages << (PAGE_SHIFT-10));
for_each_online_pgdat(pgdat) {
- for (i = 0; i < pgdat->node_spanned_pages; ++i) {
- /* this loop can take a while with 256 GB and 4k pages
- so update the NMI watchdog */
- if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
+ for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ /*
+ * This loop can take a while with 256 GB and
+ * 4k pages so defer the NMI watchdog:
+ */
+ if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
touch_nmi_watchdog();
- }
+
if (!pfn_valid(pgdat->node_start_pfn + i))
continue;
+
page = pfn_to_page(pgdat->node_start_pfn + i);
total++;
if (PageReserved(page))
@@ -89,51 +91,58 @@ void show_mem(void)
cached++;
else if (page_count(page))
shared += page_count(page) - 1;
- }
+ }
}
- printk(KERN_INFO "%lu pages of RAM\n", total);
- printk(KERN_INFO "%lu reserved pages\n",reserved);
- printk(KERN_INFO "%lu pages shared\n",shared);
- printk(KERN_INFO "%lu pages swap cached\n",cached);
+ printk(KERN_INFO "%lu pages of RAM\n", total);
+ printk(KERN_INFO "%lu reserved pages\n", reserved);
+ printk(KERN_INFO "%lu pages shared\n", shared);
+ printk(KERN_INFO "%lu pages swap cached\n", cached);
}
int after_bootmem;
static __init void *spp_getpage(void)
-{
+{
void *ptr;
+
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
- if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
- panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
- Dprintk("spp_getpage %p\n", ptr);
+ if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+ panic("set_pte_phys: cannot allocate page data %s\n",
+ after_bootmem ? "after bootmem" : "");
+ }
+
+ pr_debug("spp_getpage %p\n", ptr);
+
return ptr;
-}
+}
-static __init void set_pte_phys(unsigned long vaddr,
- unsigned long phys, pgprot_t prot)
+static __init void
+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte, new_pte;
- Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+ pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
pgd = pgd_offset_k(vaddr);
if (pgd_none(*pgd)) {
- printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+ printk(KERN_ERR
+ "PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
- pmd = (pmd_t *) spp_getpage();
+ pmd = (pmd_t *) spp_getpage();
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
if (pmd != pmd_offset(pud, 0)) {
- printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pmd, pmd_offset(pud, 0));
return;
}
}
@@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr,
pte = (pte_t *) spp_getpage();
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
if (pte != pte_offset_kernel(pmd, 0)) {
- printk("PAGETABLE BUG #02!\n");
+ printk(KERN_ERR "PAGETABLE BUG #02!\n");
return;
}
}
@@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr,
}
/* NOTE: this is meant to be run only at boot */
-void __init
-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __init
+__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
unsigned long address = __fix_to_virt(idx);
if (idx >= __end_of_fixed_addresses) {
- printk("Invalid __set_fixmap\n");
+ printk(KERN_ERR "Invalid __set_fixmap\n");
return;
}
set_pte_phys(address, phys, prot);
}
-unsigned long __meminitdata table_start, table_end;
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
static __meminit void *alloc_low_page(unsigned long *phys)
-{
+{
unsigned long pfn = table_end++;
void *adr;
if (after_bootmem) {
adr = (void *)get_zeroed_page(GFP_ATOMIC);
*phys = __pa(adr);
+
return adr;
}
- if (pfn >= end_pfn)
- panic("alloc_low_page: ran out of memory");
+ if (pfn >= end_pfn)
+ panic("alloc_low_page: ran out of memory");
adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
memset(adr, 0, PAGE_SIZE);
@@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys)
}
static __meminit void unmap_low_page(void *adr)
-{
-
+{
if (after_bootmem)
return;
early_iounmap(adr, PAGE_SIZE);
-}
+}
/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
- unsigned long vaddr;
pmd_t *pmd, *last_pmd;
+ unsigned long vaddr;
int i, pmds;
pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
vaddr = __START_KERNEL_map;
pmd = level2_kernel_pgt;
last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+
for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
for (i = 0; i < pmds; i++) {
if (pmd_present(pmd[i]))
- goto next;
+ goto continue_outer_loop;
}
vaddr += addr & ~PMD_MASK;
addr &= PMD_MASK;
+
for (i = 0; i < pmds; i++, addr += PMD_SIZE)
- set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
- __flush_tlb();
+ set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+ __flush_tlb_all();
+
return (void *)vaddr;
- next:
+continue_outer_loop:
;
}
- printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+ printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
+
return NULL;
}
-/* To avoid virtual aliases later */
+/*
+ * To avoid virtual aliases later:
+ */
__meminit void early_iounmap(void *addr, unsigned long size)
{
unsigned long vaddr;
@@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size)
vaddr = (unsigned long)addr;
pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
pmd = level2_kernel_pgt + pmd_index(vaddr);
+
for (i = 0; i < pmds; i++)
pmd_clear(pmd + i);
- __flush_tlb();
+
+ __flush_tlb_all();
}
static void __meminit
@@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
pmd_t *pmd = pmd_page + pmd_index(address);
if (address >= end) {
- if (!after_bootmem)
+ if (!after_bootmem) {
for (; i < PTRS_PER_PMD; i++, pmd++)
set_pmd(pmd, __pmd(0));
+ }
break;
}
if (pmd_val(*pmd))
continue;
- entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+ entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
entry &= __supported_pte_mask;
set_pmd(pmd, __pmd(entry));
}
@@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
- pmd_t *pmd = pmd_offset(pud,0);
+ pmd_t *pmd = pmd_offset(pud, 0);
spin_lock(&init_mm.page_table_lock);
phys_pmd_init(pmd, address, end);
spin_unlock(&init_mm.page_table_lock);
__flush_tlb_all();
}
-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
-{
+static void __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+{
int i = pud_index(addr);
-
- for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
@@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
if (addr >= end)
break;
- if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
- set_pud(pud, __pud(0));
+ if (!after_bootmem &&
+ !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
+ set_pud(pud, __pud(0));
continue;
- }
+ }
if (pud_val(*pud)) {
phys_pmd_update(pud, addr, end);
@@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
}
pmd = alloc_low_page(&pmd_phys);
+
spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
phys_pmd_init(pmd, addr, end);
spin_unlock(&init_mm.page_table_lock);
+
unmap_low_page(pmd);
}
- __flush_tlb();
-}
+ __flush_tlb_all();
+}
static void __init find_early_table_space(unsigned long end)
{
@@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end)
tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
- /* RED-PEN putting page tables only on node 0 could
- cause a hotspot and fill up ZONE_DMA. The page tables
- need roughly 0.5KB per GB. */
- start = 0x8000;
- table_start = find_e820_area(start, end, tables);
+ /*
+ * RED-PEN putting page tables only on node 0 could
+ * cause a hotspot and fill up ZONE_DMA. The page tables
+ * need roughly 0.5KB per GB.
+ */
+ start = 0x8000;
+ table_start = find_e820_area(start, end, tables);
if (table_start == -1UL)
panic("Cannot find space for the kernel page tables");
+ /*
+ * When you have a lot of RAM like 256GB, early_table will not fit
+ * into 0x8000 range, find_e820_area() will find area after kernel
+ * bss but the table_start is not page aligned, so need to round it
+ * up to avoid overlap with bss:
+ */
+ table_start = round_up(table_start, PAGE_SIZE);
table_start >>= PAGE_SHIFT;
table_end = table_start;
@@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end)
(table_start << PAGE_SHIFT) + tables);
}
-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
- This runs before bootmem is initialized and gets pages directly from the
- physical memory. To access them they are temporarily mapped. */
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
-{
- unsigned long next;
+{
+ unsigned long next;
- Dprintk("init_memory_mapping\n");
+ pr_debug("init_memory_mapping\n");
- /*
+ /*
* Find space for the kernel direct mapping tables.
- * Later we should allocate these tables in the local node of the memory
- * mapped. Unfortunately this is done currently before the nodes are
- * discovered.
+ *
+ * Later we should allocate these tables in the local node of the
+ * memory mapped. Unfortunately this is done currently before the
+ * nodes are discovered.
*/
if (!after_bootmem)
find_early_table_space(end);
@@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
end = (unsigned long)__va(end);
for (; start < end; start = next) {
- unsigned long pud_phys;
pgd_t *pgd = pgd_offset_k(start);
+ unsigned long pud_phys;
pud_t *pud;
if (after_bootmem)
@@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
pud = alloc_low_page(&pud_phys);
next = start + PGDIR_SIZE;
- if (next > end)
- next = end;
+ if (next > end)
+ next = end;
phys_pud_init(pud, __pa(start), __pa(next));
if (!after_bootmem)
set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
unmap_low_page(pud);
- }
+ }
if (!after_bootmem)
mmu_cr4_features = read_cr4();
__flush_tlb_all();
+
+ reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
+
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -402,39 +439,48 @@ void __init paging_init(void)
}
#endif
-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
- from the CPU leading to inconsistent cache lines. address and size
- must be aligned to 2MB boundaries.
- Does nothing when the mapping doesn't exist. */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size)
+/*
+ * Unmap a kernel mapping if it exists. This is useful to avoid
+ * prefetches from the CPU leading to inconsistent cache lines.
+ * address and size must be aligned to 2MB boundaries.
+ * Does nothing when the mapping doesn't exist.
+ */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
unsigned long end = address + size;
BUG_ON(address & ~LARGE_PAGE_MASK);
- BUG_ON(size & ~LARGE_PAGE_MASK);
-
- for (; address < end; address += LARGE_PAGE_SIZE) {
+ BUG_ON(size & ~LARGE_PAGE_MASK);
+
+ for (; address < end; address += LARGE_PAGE_SIZE) {
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
pmd_t *pmd;
+
if (pgd_none(*pgd))
continue;
+
pud = pud_offset(pgd, address);
if (pud_none(*pud))
- continue;
+ continue;
+
pmd = pmd_offset(pud, address);
if (!pmd || pmd_none(*pmd))
- continue;
- if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
- /* Could handle this, but it should not happen currently. */
- printk(KERN_ERR
- "clear_kernel_mapping: mapping has been split. will leak memory\n");
- pmd_ERROR(*pmd);
+ continue;
+
+ if (!(pmd_val(*pmd) & _PAGE_PSE)) {
+ /*
+ * Could handle this, but it should not happen
+ * currently:
+ */
+ printk(KERN_ERR "clear_kernel_mapping: "
+ "mapping has been split. will leak memory\n");
+ pmd_ERROR(*pmd);
}
- set_pmd(pmd, __pmd(0));
+ set_pmd(pmd, __pmd(0));
}
__flush_tlb_all();
-}
+}
/*
* Memory hotplug specific functions
@@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, (start + size -1));
+ init_memory_mapping(start, start + size-1);
ret = __add_pages(zone, start_pfn, nr_pages);
- if (ret)
- goto error;
+ WARN_ON(1);
return ret;
-error:
- printk("%s: Problem encountered in __add_pages!\n", __func__);
- return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);
@@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
- */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
-{
- int err = -EIO;
- unsigned long pfn;
- unsigned long total = 0, mem = 0;
- for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
- if (pfn_valid(pfn)) {
- online_page(pfn_to_page(pfn));
- err = 0;
- mem++;
- }
- total++;
- }
- if (!err) {
- z->spanned_pages += total;
- z->present_pages += mem;
- z->zone_pgdat->node_spanned_pages += total;
- z->zone_pgdat->node_present_pages += mem;
- }
- return err;
-}
-#endif
-
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
- kcore_vsyscall;
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
+ kcore_modules, kcore_vsyscall;
void __init mem_init(void)
{
@@ -521,8 +535,15 @@ void __init mem_init(void)
pci_iommu_alloc();
- /* clear the zero-page */
- memset(empty_zero_page, 0, PAGE_SIZE);
+ /* clear_bss() already clear the empty_zero_page */
+
+ /* temporary debugging - double check it's true: */
+ {
+ int i;
+
+ for (i = 0; i < 1024; i++)
+ WARN_ON_ONCE(empty_zero_page[i]);
+ }
reservedpages = 0;
@@ -534,7 +555,6 @@ void __init mem_init(void)
#endif
reservedpages = end_pfn - totalram_pages -
absent_pages_in_range(0, end_pfn);
-
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
@@ -542,15 +562,16 @@ void __init mem_init(void)
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
/* Register memory areas for /proc/kcore */
- kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
- kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
+ kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+ kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
VMALLOC_END-VMALLOC_START);
kclist_add(&kcore_kernel, &_stext, _end - _stext);
kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
- kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
+ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
VSYSCALL_END - VSYSCALL_START);
- printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
+ printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
+ "%ldk reserved, %ldk data, %ldk init)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
end_pfn << (PAGE_SHIFT-10),
codesize >> 10,
@@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
if (begin >= end)
return;
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
memset((void *)(addr & ~(PAGE_SIZE-1)),
POISON_FREE_INITMEM, PAGE_SIZE);
- if (addr >= __START_KERNEL_map)
- change_page_attr_addr(addr, 1, __pgprot(0));
free_page(addr);
totalram_pages++;
}
- if (addr > __START_KERNEL_map)
- global_flush_tlb();
+#endif
}
void free_initmem(void)
@@ -589,6 +618,8 @@ void free_initmem(void)
}
#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
@@ -603,25 +634,27 @@ void mark_rodata_ro(void)
#ifdef CONFIG_KPROBES
start = (unsigned long)__start_rodata;
#endif
-
+
end = (unsigned long)__end_rodata;
start = (start + PAGE_SIZE - 1) & PAGE_MASK;
end &= PAGE_MASK;
if (end <= start)
return;
- change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
+ set_memory_ro(start, (end - start) >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
- /*
- * change_page_attr_addr() requires a global_flush_tlb() call after it.
- * We do this after the printk so that if something went wrong in the
- * change, the printk gets out at least to give a better debug hint
- * of who is the culprit.
- */
- global_flush_tlb();
+ rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+ printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+ set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+ printk(KERN_INFO "Testing CPA: again\n");
+ set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
}
#endif
@@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
-{
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+{
#ifdef CONFIG_NUMA
int nid = phys_to_nid(phys);
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
+
if (pfn >= end_pfn) {
- /* This can happen with kdump kernels when accessing firmware
- tables. */
+ /*
+ * This can happen with kdump kernels when accessing
+ * firmware tables:
+ */
if (pfn < end_pfn_map)
return;
+
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
phys, len);
return;
@@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
- reserve_bootmem_node(NODE_DATA(nid), phys, len);
-#else
- reserve_bootmem(phys, len);
+ reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else
+ reserve_bootmem(phys, len);
#endif
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
@@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
}
}
-int kern_addr_valid(unsigned long addr)
-{
+int kern_addr_valid(unsigned long addr)
+{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
if (above != 0 && above != -1UL)
- return 0;
-
+ return 0;
+
pgd = pgd_offset_k(addr);
if (pgd_none(*pgd))
return 0;
pud = pud_offset(pgd, addr);
if (pud_none(*pud))
- return 0;
+ return 0;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return 0;
+
if (pmd_large(*pmd))
return pfn_valid(pmd_pfn(*pmd));
pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte))
return 0;
+
return pfn_valid(pte_pfn(*pte));
}
-/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
- covers the 64bit vsyscall page now. 32bit has a real VMA now and does
- not need special handling anymore. */
-
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page. This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
static struct vm_area_struct gate_vma = {
- .vm_start = VSYSCALL_START,
- .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
- .vm_page_prot = PAGE_READONLY_EXEC,
- .vm_flags = VM_READ | VM_EXEC
+ .vm_start = VSYSCALL_START,
+ .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+ .vm_page_prot = PAGE_READONLY_EXEC,
+ .vm_flags = VM_READ | VM_EXEC
};
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
@@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
int in_gate_area(struct task_struct *task, unsigned long addr)
{
struct vm_area_struct *vma = get_gate_vma(task);
+
if (!vma)
return 0;
+
return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
-/* Use this when you have no reliable task/vma, typically from interrupt
- * context. It is less reliable than using the task's vma and may give
- * false positives.
+/*
+ * Use this when you have no reliable task/vma, typically from interrupt
+ * context. It is less reliable than using the task's vma and may give
+ * false positives:
*/
int in_gate_area_no_task(unsigned long addr)
{
@@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma)
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
*/
-int __meminit vmemmap_populate(struct page *start_page,
- unsigned long size, int node)
+int __meminit
+vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
unsigned long addr = (unsigned long)start_page;
unsigned long end = (unsigned long)(start_page + size);
@@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page,
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
return -ENOMEM;
+
pud = vmemmap_pud_populate(pgd, addr, node);
if (!pud)
return -ENOMEM;
@@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page,
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
pte_t entry;
- void *p = vmemmap_alloc_block(PMD_SIZE, node);
+ void *p;
+
+ p = vmemmap_alloc_block(PMD_SIZE, node);
if (!p)
return -ENOMEM;
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
- mk_pte_huge(entry);
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
addr, addr + PMD_SIZE - 1, p, node);
- } else
+ } else {
vmemmap_verify((pte_t *)pmd, node, addr, next);
+ }
}
-
return 0;
}
#endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 00000000000..ed795721ca8
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,501 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+enum ioremap_mode {
+ IOR_MODE_UNCACHED,
+ IOR_MODE_CACHED,
+};
+
+#ifdef CONFIG_X86_64
+
+unsigned long __phys_addr(unsigned long x)
+{
+ if (x >= __START_KERNEL_map)
+ return x - __START_KERNEL_map + phys_base;
+ return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+#endif
+
+int page_is_ram(unsigned long pagenr)
+{
+ unsigned long addr, end;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ /*
+ * Not usable memory:
+ */
+ if (e820.map[i].type != E820_RAM)
+ continue;
+ addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
+ end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
+
+ /*
+ * Sanity check: Some BIOSen report areas as RAM that
+ * are not. Notably the 640->1Mb area, which is the
+ * PCI BIOS area.
+ */
+ if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+ end < (BIOS_END >> PAGE_SHIFT))
+ continue;
+
+ if ((pagenr >= addr) && (pagenr < end))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int ioremap_change_attr(unsigned long paddr, unsigned long size,
+ enum ioremap_mode mode)
+{
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ unsigned long nrpages = size >> PAGE_SHIFT;
+ int err, level;
+
+ /* No change for pages after the last mapping */
+ if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
+ return 0;
+
+ /*
+ * If there is no identity map for this address,
+ * change_page_attr_addr is unnecessary
+ */
+ if (!lookup_address(vaddr, &level))
+ return 0;
+
+ switch (mode) {
+ case IOR_MODE_UNCACHED:
+ default:
+ err = set_memory_uc(vaddr, nrpages);
+ break;
+ case IOR_MODE_CACHED:
+ err = set_memory_wb(vaddr, nrpages);
+ break;
+ }
+
+ return err;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
+ enum ioremap_mode mode)
+{
+ void __iomem *addr;
+ struct vm_struct *area;
+ unsigned long offset, last_addr;
+ pgprot_t prot;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
+ return NULL;
+
+ /*
+ * Don't remap the low PCI/ISA area, it's always mapped..
+ */
+ if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+ return (__force void __iomem *)phys_to_virt(phys_addr);
+
+ /*
+ * Don't allow anybody to remap normal RAM that we're using..
+ */
+ for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
+ (offset << PAGE_SHIFT) < last_addr; offset++) {
+ if (page_is_ram(offset))
+ return NULL;
+ }
+
+ switch (mode) {
+ case IOR_MODE_UNCACHED:
+ default:
+ prot = PAGE_KERNEL_NOCACHE;
+ break;
+ case IOR_MODE_CACHED:
+ prot = PAGE_KERNEL;
+ break;
+ }
+
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+ /*
+ * Ok, go for it..
+ */
+ area = get_vm_area(size, VM_IOREMAP);
+ if (!area)
+ return NULL;
+ area->phys_addr = phys_addr;
+ addr = (void __iomem *) area->addr;
+ if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
+ phys_addr, prot)) {
+ remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+ return NULL;
+ }
+
+ if (ioremap_change_attr(phys_addr, size, mode) < 0) {
+ vunmap(addr);
+ return NULL;
+ }
+
+ return (void __iomem *) (offset + (char __iomem *)addr);
+}
+
+/**
+ * ioremap_nocache - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size)
+{
+ return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size)
+{
+ return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+ struct vm_struct *p, *o;
+
+ if ((void __force *)addr <= high_memory)
+ return;
+
+ /*
+ * __ioremap special-cases the PCI/ISA range by not instantiating a
+ * vm_area and by simply returning an address into the kernel mapping
+ * of ISA space. So handle that here.
+ */
+ if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+ addr < phys_to_virt(ISA_END_ADDRESS))
+ return;
+
+ addr = (volatile void __iomem *)
+ (PAGE_MASK & (unsigned long __force)addr);
+
+ /* Use the vm area unlocked, assuming the caller
+ ensures there isn't another iounmap for the same address
+ in parallel. Reuse of the virtual address is prevented by
+ leaving it in the global lists until we're done with it.
+ cpa takes care of the direct mappings. */
+ read_lock(&vmlist_lock);
+ for (p = vmlist; p; p = p->next) {
+ if (p->addr == addr)
+ break;
+ }
+ read_unlock(&vmlist_lock);
+
+ if (!p) {
+ printk(KERN_ERR "iounmap: bad address %p\n", addr);
+ dump_stack();
+ return;
+ }
+
+ /* Reset the direct mapping. Can block */
+ ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
+
+ /* Finally remove it */
+ o = remove_vm_area((void *)addr);
+ BUG_ON(p != o || o == NULL);
+ kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+#ifdef CONFIG_X86_32
+
+int __initdata early_ioremap_debug;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static __initdata int after_paging_init;
+static __initdata unsigned long bm_pte[1024]
+ __attribute__((aligned(PAGE_SIZE)));
+
+static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
+{
+ return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+}
+
+static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
+{
+ return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+}
+
+void __init early_ioremap_init(void)
+{
+ unsigned long *pgd;
+
+ if (early_ioremap_debug)
+ printk(KERN_INFO "early_ioremap_init()\n");
+
+ pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+ *pgd = __pa(bm_pte) | _PAGE_TABLE;
+ memset(bm_pte, 0, sizeof(bm_pte));
+ /*
+ * The boot-ioremap range spans multiple pgds, for which
+ * we are not prepared:
+ */
+ if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
+ WARN_ON(1);
+ printk(KERN_WARNING "pgd %p != %p\n",
+ pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+ fix_to_virt(FIX_BTMAP_BEGIN));
+ printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
+ fix_to_virt(FIX_BTMAP_END));
+
+ printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
+ printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
+ FIX_BTMAP_BEGIN);
+ }
+}
+
+void __init early_ioremap_clear(void)
+{
+ unsigned long *pgd;
+
+ if (early_ioremap_debug)
+ printk(KERN_INFO "early_ioremap_clear()\n");
+
+ pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+ *pgd = 0;
+ paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
+ __flush_tlb_all();
+}
+
+void __init early_ioremap_reset(void)
+{
+ enum fixed_addresses idx;
+ unsigned long *pte, phys, addr;
+
+ after_paging_init = 1;
+ for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
+ addr = fix_to_virt(idx);
+ pte = early_ioremap_pte(addr);
+ if (!*pte & _PAGE_PRESENT) {
+ phys = *pte & PAGE_MASK;
+ set_fixmap(idx, phys);
+ }
+ }
+}
+
+static void __init __early_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys, pgprot_t flags)
+{
+ unsigned long *pte, addr = __fix_to_virt(idx);
+
+ if (idx >= __end_of_fixed_addresses) {
+ BUG();
+ return;
+ }
+ pte = early_ioremap_pte(addr);
+ if (pgprot_val(flags))
+ *pte = (phys & PAGE_MASK) | pgprot_val(flags);
+ else
+ *pte = 0;
+ __flush_tlb_one(addr);
+}
+
+static inline void __init early_set_fixmap(enum fixed_addresses idx,
+ unsigned long phys)
+{
+ if (after_paging_init)
+ set_fixmap(idx, phys);
+ else
+ __early_set_fixmap(idx, phys, PAGE_KERNEL);
+}
+
+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
+{
+ if (after_paging_init)
+ clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, __pgprot(0));
+}
+
+
+int __initdata early_ioremap_nested;
+
+static int __init check_early_ioremap_leak(void)
+{
+ if (!early_ioremap_nested)
+ return 0;
+
+ printk(KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n",
+ early_ioremap_nested);
+ printk(KERN_WARNING
+ "please boot with early_ioremap_debug and report the dmesg.\n");
+ WARN_ON(1);
+
+ return 1;
+}
+late_initcall(check_early_ioremap_leak);
+
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+ unsigned long offset, last_addr;
+ unsigned int nrpages, nesting;
+ enum fixed_addresses idx0, idx;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ nesting = early_ioremap_nested;
+ if (early_ioremap_debug) {
+ printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
+ phys_addr, size, nesting);
+ dump_stack();
+ }
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ if (nesting >= FIX_BTMAPS_NESTING) {
+ WARN_ON(1);
+ return NULL;
+ }
+ early_ioremap_nested++;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (nrpages > NR_FIX_BTMAPS) {
+ WARN_ON(1);
+ return NULL;
+ }
+
+ /*
+ * Ok, go for it..
+ */
+ idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ idx = idx0;
+ while (nrpages > 0) {
+ early_set_fixmap(idx, phys_addr);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ if (early_ioremap_debug)
+ printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+
+ return (void *) (offset + fix_to_virt(idx0));
+}
+
+void __init early_iounmap(void *addr, unsigned long size)
+{
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ unsigned int nesting;
+
+ nesting = --early_ioremap_nested;
+ WARN_ON(nesting < 0);
+
+ if (early_ioremap_debug) {
+ printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+ size, nesting);
+ dump_stack();
+ }
+
+ virt_addr = (unsigned long)addr;
+ if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
+ WARN_ON(1);
+ return;
+ }
+ offset = virt_addr & ~PAGE_MASK;
+ nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+ while (nrpages > 0) {
+ early_clear_fixmap(idx);
+ --idx;
+ --nrpages;
+ }
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+ WARN_ON(1);
+}
+
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
deleted file mode 100644
index 0b278315d73..00000000000
--- a/arch/x86/mm/ioremap_32.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * arch/i386/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <asm/fixmap.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-
-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
- void __iomem * addr;
- struct vm_struct * area;
- unsigned long offset, last_addr;
- pgprot_t prot;
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr)
- return NULL;
-
- /*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return (void __iomem *) phys_to_virt(phys_addr);
-
- /*
- * Don't allow anybody to remap normal RAM that we're using..
- */
- if (phys_addr <= virt_to_phys(high_memory - 1)) {
- char *t_addr, *t_end;
- struct page *page;
-
- t_addr = __va(phys_addr);
- t_end = t_addr + (size - 1);
-
- for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
- if(!PageReserved(page))
- return NULL;
- }
-
- prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
- | _PAGE_ACCESSED | flags);
-
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
- /*
- * Ok, go for it..
- */
- area = get_vm_area(size, VM_IOREMAP | (flags << 20));
- if (!area)
- return NULL;
- area->phys_addr = phys_addr;
- addr = (void __iomem *) area->addr;
- if (ioremap_page_range((unsigned long) addr,
- (unsigned long) addr + size, phys_addr, prot)) {
- vunmap((void __force *) addr);
- return NULL;
- }
- return (void __iomem *) (offset + (char __iomem *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache - map bus memory into CPU space
- * @offset: bus address of the memory
- * @size: size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address.
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- *
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
- unsigned long last_addr;
- void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
- if (!p)
- return p;
-
- /* Guaranteed to be > phys_addr, as per __ioremap() */
- last_addr = phys_addr + size - 1;
-
- if (last_addr < virt_to_phys(high_memory) - 1) {
- struct page *ppage = virt_to_page(__va(phys_addr));
- unsigned long npages;
-
- phys_addr &= PAGE_MASK;
-
- /* This might overflow and become zero.. */
- last_addr = PAGE_ALIGN(last_addr);
-
- /* .. but that's ok, because modulo-2**n arithmetic will make
- * the page-aligned "last - first" come out right.
- */
- npages = (last_addr - phys_addr) >> PAGE_SHIFT;
-
- if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
- iounmap(p);
- p = NULL;
- }
- global_flush_tlb();
- }
-
- return p;
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
- struct vm_struct *p, *o;
-
- if ((void __force *)addr <= high_memory)
- return;
-
- /*
- * __ioremap special-cases the PCI/ISA range by not instantiating a
- * vm_area and by simply returning an address into the kernel mapping
- * of ISA space. So handle that here.
- */
- if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
- addr < phys_to_virt(ISA_END_ADDRESS))
- return;
-
- addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-
- /* Use the vm area unlocked, assuming the caller
- ensures there isn't another iounmap for the same address
- in parallel. Reuse of the virtual address is prevented by
- leaving it in the global lists until we're done with it.
- cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
- break;
- }
- read_unlock(&vmlist_lock);
-
- if (!p) {
- printk("iounmap: bad address %p\n", addr);
- dump_stack();
- return;
- }
-
- /* Reset the direct mapping. Can block */
- if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
- change_page_attr(virt_to_page(__va(p->phys_addr)),
- get_vm_area_size(p) >> PAGE_SHIFT,
- PAGE_KERNEL);
- global_flush_tlb();
- }
-
- /* Finally remove it */
- o = remove_vm_area((void *)addr);
- BUG_ON(p != o || o == NULL);
- kfree(p);
-}
-EXPORT_SYMBOL(iounmap);
-
-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
-{
- unsigned long offset, last_addr;
- unsigned int nrpages;
- enum fixed_addresses idx;
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr)
- return NULL;
-
- /*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return phys_to_virt(phys_addr);
-
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr) - phys_addr;
-
- /*
- * Mappings have to fit in the FIX_BTMAP area.
- */
- nrpages = size >> PAGE_SHIFT;
- if (nrpages > NR_FIX_BTMAPS)
- return NULL;
-
- /*
- * Ok, go for it..
- */
- idx = FIX_BTMAP_BEGIN;
- while (nrpages > 0) {
- set_fixmap(idx, phys_addr);
- phys_addr += PAGE_SIZE;
- --idx;
- --nrpages;
- }
- return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
-}
-
-void __init bt_iounmap(void *addr, unsigned long size)
-{
- unsigned long virt_addr;
- unsigned long offset;
- unsigned int nrpages;
- enum fixed_addresses idx;
-
- virt_addr = (unsigned long)addr;
- if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
- return;
- offset = virt_addr & ~PAGE_MASK;
- nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
-
- idx = FIX_BTMAP_BEGIN;
- while (nrpages > 0) {
- clear_fixmap(idx);
- --idx;
- --nrpages;
- }
-}
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
deleted file mode 100644
index 6cac90aa503..00000000000
--- a/arch/x86/mm/ioremap_64.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * arch/x86_64/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-
-#include <asm/pgalloc.h>
-#include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/proto.h>
-
-unsigned long __phys_addr(unsigned long x)
-{
- if (x >= __START_KERNEL_map)
- return x - __START_KERNEL_map + phys_base;
- return x - PAGE_OFFSET;
-}
-EXPORT_SYMBOL(__phys_addr);
-
-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
-
-/*
- * Fix up the linear direct mapping of the kernel to avoid cache attribute
- * conflicts.
- */
-static int
-ioremap_change_attr(unsigned long phys_addr, unsigned long size,
- unsigned long flags)
-{
- int err = 0;
- if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
- unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long vaddr = (unsigned long) __va(phys_addr);
-
- /*
- * Must use a address here and not struct page because the phys addr
- * can be a in hole between nodes and not have an memmap entry.
- */
- err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
- if (!err)
- global_flush_tlb();
- }
- return err;
-}
-
-/*
- * Generic mapping function
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
- void * addr;
- struct vm_struct * area;
- unsigned long offset, last_addr;
- pgprot_t pgprot;
-
- /* Don't allow wraparound or zero size */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr)
- return NULL;
-
- /*
- * Don't remap the low PCI/ISA area, it's always mapped..
- */
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return (__force void __iomem *)phys_to_virt(phys_addr);
-
-#ifdef CONFIG_FLATMEM
- /*
- * Don't allow anybody to remap normal RAM that we're using..
- */
- if (last_addr < virt_to_phys(high_memory)) {
- char *t_addr, *t_end;
- struct page *page;
-
- t_addr = __va(phys_addr);
- t_end = t_addr + (size - 1);
-
- for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
- if(!PageReserved(page))
- return NULL;
- }
-#endif
-
- pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
- | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
- /*
- * Mappings have to be page-aligned
- */
- offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
- /*
- * Ok, go for it..
- */
- area = get_vm_area(size, VM_IOREMAP | (flags << 20));
- if (!area)
- return NULL;
- area->phys_addr = phys_addr;
- addr = area->addr;
- if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
- phys_addr, pgprot)) {
- remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
- return NULL;
- }
- if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
- area->flags &= 0xffffff;
- vunmap(addr);
- return NULL;
- }
- return (__force void __iomem *) (offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache - map bus memory into CPU space
- * @offset: bus address of the memory
- * @size: size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address.
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- *
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
- return __ioremap(phys_addr, size, _PAGE_PCD);
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
- struct vm_struct *p, *o;
-
- if (addr <= high_memory)
- return;
- if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
- addr < phys_to_virt(ISA_END_ADDRESS))
- return;
-
- addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
- /* Use the vm area unlocked, assuming the caller
- ensures there isn't another iounmap for the same address
- in parallel. Reuse of the virtual address is prevented by
- leaving it in the global lists until we're done with it.
- cpa takes care of the direct mappings. */
- read_lock(&vmlist_lock);
- for (p = vmlist; p; p = p->next) {
- if (p->addr == addr)
- break;
- }
- read_unlock(&vmlist_lock);
-
- if (!p) {
- printk("iounmap: bad address %p\n", addr);
- dump_stack();
- return;
- }
-
- /* Reset the direct mapping. Can block */
- if (p->flags >> 20)
- ioremap_change_attr(p->phys_addr, p->size, 0);
-
- /* Finally remove it */
- o = remove_vm_area((void *)addr);
- BUG_ON(p != o || o == NULL);
- kfree(p);
-}
-EXPORT_SYMBOL(iounmap);
-
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index a96006f7ae0..7a2ebce87df 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -1,9 +1,9 @@
-/*
+/*
* AMD K8 NUMA support.
* Discover the memory map and associated nodes.
- *
+ *
* This version reads it directly from the K8 northbridge.
- *
+ *
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
#include <linux/kernel.h>
@@ -22,132 +22,135 @@
static __init int find_northbridge(void)
{
- int num;
+ int num;
- for (num = 0; num < 32; num++) {
+ for (num = 0; num < 32; num++) {
u32 header;
-
- header = read_pci_config(0, num, 0, 0x00);
- if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
- continue;
-
- header = read_pci_config(0, num, 1, 0x00);
- if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
- continue;
- return num;
- }
-
- return -1;
+
+ header = read_pci_config(0, num, 0, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+ continue;
+
+ header = read_pci_config(0, num, 1, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+ continue;
+ return num;
+ }
+
+ return -1;
}
int __init k8_scan_nodes(unsigned long start, unsigned long end)
-{
+{
unsigned long prevbase;
struct bootnode nodes[8];
- int nodeid, i, j, nb;
+ int nodeid, i, nb;
unsigned char nodeids[8];
int found = 0;
u32 reg;
unsigned numnodes;
- unsigned num_cores;
+ unsigned cores;
+ unsigned bits;
+ int j;
if (!early_pci_allowed())
return -1;
- nb = find_northbridge();
- if (nb < 0)
+ nb = find_northbridge();
+ if (nb < 0)
return nb;
- printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
-
- num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
- printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
+ printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
- reg = read_pci_config(0, nb, 0, 0x60);
+ reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
return -1;
printk(KERN_INFO "Number of nodes %d\n", numnodes);
- memset(&nodes,0,sizeof(nodes));
+ memset(&nodes, 0, sizeof(nodes));
prevbase = 0;
- for (i = 0; i < 8; i++) {
- unsigned long base,limit;
+ for (i = 0; i < 8; i++) {
+ unsigned long base, limit;
u32 nodeid;
-
+
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
- nodeid = limit & 7;
+ nodeid = limit & 7;
nodeids[i] = nodeid;
- if ((base & 3) == 0) {
+ if ((base & 3) == 0) {
if (i < numnodes)
- printk("Skipping disabled node %d\n", i);
+ printk("Skipping disabled node %d\n", i);
continue;
- }
+ }
if (nodeid >= numnodes) {
printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
- base, limit);
+ base, limit);
continue;
- }
+ }
- if (!limit) {
- printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
- base);
+ if (!limit) {
+ printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
+ i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
- printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
- nodeid, (base>>8)&3, (limit>>8) & 3);
- return -1;
- }
+ printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
+ nodeid, (base>>8)&3, (limit>>8) & 3);
+ return -1;
+ }
if (node_isset(nodeid, node_possible_map)) {
- printk(KERN_INFO "Node %d already present. Skipping\n",
+ printk(KERN_INFO "Node %d already present. Skipping\n",
nodeid);
continue;
}
- limit >>= 16;
- limit <<= 24;
+ limit >>= 16;
+ limit <<= 24;
limit |= (1<<24)-1;
limit++;
if (limit > end_pfn << PAGE_SHIFT)
limit = end_pfn << PAGE_SHIFT;
if (limit <= base)
- continue;
-
+ continue;
+
base >>= 16;
- base <<= 24;
-
- if (base < start)
- base = start;
- if (limit > end)
- limit = end;
- if (limit == base) {
- printk(KERN_ERR "Empty node %d\n", nodeid);
- continue;
+ base <<= 24;
+
+ if (base < start)
+ base = start;
+ if (limit > end)
+ limit = end;
+ if (limit == base) {
+ printk(KERN_ERR "Empty node %d\n", nodeid);
+ continue;
}
- if (limit < base) {
+ if (limit < base) {
printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
- nodeid, base, limit);
+ nodeid, base, limit);
continue;
- }
-
+ }
+
/* Could sort here, but pun for now. Should not happen anyroads. */
- if (prevbase > base) {
+ if (prevbase > base) {
printk(KERN_ERR "Node map not sorted %lx,%lx\n",
- prevbase,base);
+ prevbase, base);
return -1;
}
-
- printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
- nodeid, base, limit);
-
+
+ printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
+ nodeid, base, limit);
+
found++;
-
- nodes[nodeid].start = base;
+
+ nodes[nodeid].start = base;
nodes[nodeid].end = limit;
e820_register_active_regions(nodeid,
nodes[nodeid].start >> PAGE_SHIFT,
@@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
prevbase = base;
node_set(nodeid, node_possible_map);
- }
+ }
if (!found)
- return -1;
+ return -1;
memnode_shift = compute_hash_shift(nodes, 8);
- if (memnode_shift < 0) {
- printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
- return -1;
- }
- printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+ if (memnode_shift < 0) {
+ printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+ return -1;
+ }
+ printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+
+ /* use the coreid bits from early_identify_cpu */
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = (1<<bits);
for (i = 0; i < 8; i++) {
- if (nodes[i].start != nodes[i].end) {
+ if (nodes[i].start != nodes[i].end) {
nodeid = nodeids[i];
- for (j = 0; j < num_cores; j++)
- apicid_to_node[(nodeid * num_cores) + j] = i;
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- }
+ for (j = 0; j < cores; j++)
+ apicid_to_node[(nodeid << bits) + j] = i;
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ }
}
numa_init_array();
return 0;
-}
+}
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c
index 552e0847375..56fe7124fbe 100644
--- a/arch/x86/mm/mmap_32.c
+++ b/arch/x86/mm/mmap.c
@@ -1,10 +1,13 @@
/*
- * linux/arch/i386/mm/mmap.c
+ * Flexible mmap layout support
*
- * flexible mmap layout support
+ * Based on code by Ingo Molnar and Andi Kleen, copyrighted
+ * as follows:
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
+ * Copyright 2005 Andi Kleen, SUSE Labs.
+ * Copyright 2007 Jiri Kosina, SUSE Labs.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -19,14 +22,12 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
*/
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
+#include <linux/limits.h>
#include <linux/sched.h>
/*
@@ -37,20 +38,71 @@
#define MIN_GAP (128*1024*1024)
#define MAX_GAP (TASK_SIZE/6*5)
-static inline unsigned long mmap_base(struct mm_struct *mm)
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static int mmap_is_ia32(void)
{
- unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
- unsigned long random_factor = 0;
+#ifdef CONFIG_X86_32
+ return 1;
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ if (test_thread_flag(TIF_IA32))
+ return 1;
+#endif
+ return 0;
+}
- if (current->flags & PF_RANDOMIZE)
- random_factor = get_random_int() % (1024*1024);
+static int mmap_is_legacy(void)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+static unsigned long mmap_rnd(void)
+{
+ unsigned long rnd = 0;
+
+ /*
+ * 8 bits of randomness in 32bit mmaps, 20 address space bits
+ * 28 bits of randomness in 64bit mmaps, 40 address space bits
+ */
+ if (current->flags & PF_RANDOMIZE) {
+ if (mmap_is_ia32())
+ rnd = (long)get_random_int() % (1<<8);
+ else
+ rnd = (long)(get_random_int() % (1<<28));
+ }
+ return rnd << PAGE_SHIFT;
+}
+
+static unsigned long mmap_base(void)
+{
+ unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
+ return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+}
+
+/*
+ * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
+ * does, but not when emulating X86_32
+ */
+static unsigned long mmap_legacy_base(void)
+{
+ if (mmap_is_ia32())
+ return TASK_UNMAPPED_BASE;
+ else
+ return TASK_UNMAPPED_BASE + mmap_rnd();
}
/*
@@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
- /*
- * Fall back to the standard layout if the personality
- * bit is set, or if the expected stack growth is unlimited:
- */
- if (sysctl_legacy_va_layout ||
- (current->personality & ADDR_COMPAT_LAYOUT) ||
- current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
- mm->mmap_base = TASK_UNMAPPED_BASE;
+ if (mmap_is_legacy()) {
+ mm->mmap_base = mmap_legacy_base();
mm->get_unmapped_area = arch_get_unmapped_area;
mm->unmap_area = arch_unmap_area;
} else {
- mm->mmap_base = mmap_base(mm);
+ mm->mmap_base = mmap_base();
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
mm->unmap_area = arch_unmap_area_topdown;
}
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
deleted file mode 100644
index 80bba0dc000..00000000000
--- a/arch/x86/mm/mmap_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2005 Andi Kleen, SuSE Labs.
- * Licensed under GPL, v.2
- */
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/random.h>
-#include <asm/ia32.h>
-
-/* Notebook: move the mmap code from sys_x86_64.c over here. */
-
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->flags & _TIF_IA32)
- return ia32_pick_mmap_layout(mm);
-#endif
- mm->mmap_base = TASK_UNMAPPED_BASE;
- if (current->flags & PF_RANDOMIZE) {
- /* Add 28bit randomness which is about 40bits of address space
- because mmap base has to be page aligned.
- or ~1/128 of the total user VM
- (total user address space is 47bits) */
- unsigned rnd = get_random_int() & 0xfffffff;
- mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
- }
- mm->get_unmapped_area = arch_get_unmapped_area;
- mm->unmap_area = arch_unmap_area;
-}
-
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3d6926ba899..dc3b1f7e145 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,7 +1,7 @@
-/*
+/*
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */
+ */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
@@ -11,35 +11,45 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
+#include <linux/sched.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>
+#include <asm/k8.h>
#ifndef Dprintk
#define Dprintk(x...)
#endif
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
bootmem_data_t plat_node_bdata[MAX_NUMNODES];
struct memnode memnode;
-unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
+int x86_cpu_to_node_map_init[NR_CPUS] = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
-unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
- [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+void *x86_cpu_to_node_map_early_ptr;
+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
+
+s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
-cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_to_cpumask_map);
int numa_off __initdata;
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;
-
/*
* Given a shift value, try to populate memnodemap[]
* Returns :
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size;
* 0 if memnodmap[] too small (of shift too small)
* -1 if node overlap or lost ram (shift too big)
*/
-static int __init
-populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
+static int __init populate_memnodemap(const struct bootnode *nodes,
+ int numnodes, int shift)
{
- int i;
- int res = -1;
unsigned long addr, end;
+ int i, res = -1;
- memset(memnodemap, 0xff, memnodemapsize);
+ memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
for (i = 0; i < numnodes; i++) {
addr = nodes[i].start;
end = nodes[i].end;
@@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
if ((end >> shift) >= memnodemapsize)
return 0;
do {
- if (memnodemap[addr >> shift] != 0xff)
+ if (memnodemap[addr >> shift] != NUMA_NO_NODE)
return -1;
memnodemap[addr >> shift] = i;
addr += (1UL << shift);
} while (addr < end);
res = 1;
- }
+ }
return res;
}
@@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void)
unsigned long pad, pad_addr;
memnodemap = memnode.embedded_map;
- if (memnodemapsize <= 48)
+ if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
return 0;
pad = L1_CACHE_BYTES - 1;
pad_addr = 0x8000;
- nodemap_size = pad + memnodemapsize;
+ nodemap_size = pad + sizeof(s16) * memnodemapsize;
nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
nodemap_size);
if (nodemap_addr == -1UL) {
@@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void)
}
pad_addr = (nodemap_addr + pad) & ~pad;
memnodemap = phys_to_virt(pad_addr);
+ reserve_early(nodemap_addr, nodemap_addr + nodemap_size);
printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
nodemap_addr, nodemap_addr + nodemap_size);
@@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void)
* The LSB of all start and end addresses in the node map is the value of the
* maximum possible shift.
*/
-static int __init
-extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
+ int numnodes)
{
int i, nodes_used = 0;
unsigned long start, end;
@@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
shift);
if (populate_memnodemap(nodes, numnodes, shift) != 1) {
- printk(KERN_INFO
- "Your memory is not aligned you need to rebuild your kernel "
- "with a bigger NODEMAPSIZE shift=%d\n",
- shift);
+ printk(KERN_INFO "Your memory is not aligned you need to "
+ "rebuild your kernel with a bigger NODEMAPSIZE "
+ "shift=%d\n", shift);
return -1;
}
return shift;
}
-#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
return phys_to_nid(pfn << PAGE_SHIFT);
}
-#endif
-static void * __init
-early_node_mem(int nodeid, unsigned long start, unsigned long end,
- unsigned long size)
+static void * __init early_node_mem(int nodeid, unsigned long start,
+ unsigned long end, unsigned long size)
{
unsigned long mem = find_e820_area(start, end, size);
void *ptr;
+
if (mem != -1L)
return __va(mem);
ptr = __alloc_bootmem_nopanic(size,
SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
if (ptr == NULL) {
printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
- size, nodeid);
+ size, nodeid);
return NULL;
}
return ptr;
}
/* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{
- unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
- unsigned long nodedata_phys;
+void __init setup_node_bootmem(int nodeid, unsigned long start,
+ unsigned long end)
+{
+ unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
+ unsigned long bootmap_start, nodedata_phys;
void *bootmap;
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
- start = round_up(start, ZONE_ALIGN);
+ start = round_up(start, ZONE_ALIGN);
- printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+ printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+ start, end);
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
@@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
/* Find a place for the bootmem map */
- bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
+ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT);
if (bootmap == NULL) {
if (nodedata_phys < start || nodedata_phys >= end)
- free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
+ free_bootmem((unsigned long)node_data[nodeid],
+ pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
- Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
-
+ Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
+
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
- bootmap_start >> PAGE_SHIFT,
- start_pfn, end_pfn);
+ bootmap_start >> PAGE_SHIFT,
+ start_pfn, end_pfn);
free_bootmem_with_active_regions(nodeid, end);
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+ reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
+ reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
+ bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
srat_reserve_add_area(nodeid);
#endif
node_set_online(nodeid);
-}
-
-/* Initialize final allocator for a zone */
-void __init setup_node_zones(int nodeid)
-{
- unsigned long start_pfn, end_pfn, memmapsize, limit;
-
- start_pfn = node_start_pfn(nodeid);
- end_pfn = node_end_pfn(nodeid);
-
- Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
- nodeid, start_pfn, end_pfn);
-
- /* Try to allocate mem_map at end to not fill up precious <4GB
- memory. */
- memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
- limit = end_pfn << PAGE_SHIFT;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- NODE_DATA(nodeid)->node_mem_map =
- __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
- memmapsize, SMP_CACHE_BYTES,
- round_down(limit - memmapsize, PAGE_SIZE),
- limit);
-#endif
-}
+}
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
void __init numa_init_array(void)
{
int rr, i;
- /* There are unfortunately some poorly designed mainboards around
- that only connect memory to a single CPU. This breaks the 1:1 cpu->node
- mapping. To avoid this fill in the mapping for all possible
- CPUs, as the number of CPUs is not known yet.
- We round robin the existing nodes. */
+
rr = first_node(node_online_map);
for (i = 0; i < NR_CPUS; i++) {
- if (cpu_to_node(i) != NUMA_NO_NODE)
+ if (early_cpu_to_node(i) != NUMA_NO_NODE)
continue;
- numa_set_node(i, rr);
+ numa_set_node(i, rr);
rr = next_node(rr, node_online_map);
if (rr == MAX_NUMNODES)
rr = first_node(node_online_map);
}
-
}
#ifdef CONFIG_NUMA_EMU
@@ -276,15 +265,17 @@ void __init numa_init_array(void)
char *cmdline __initdata;
/*
- * Setups up nid to range from addr to addr + size. If the end boundary is
- * greater than max_addr, then max_addr is used instead. The return value is 0
- * if there is additional memory left for allocation past addr and -1 otherwise.
- * addr is adjusted to be at the end of the node.
+ * Setups up nid to range from addr to addr + size. If the end
+ * boundary is greater than max_addr, then max_addr is used instead.
+ * The return value is 0 if there is additional memory left for
+ * allocation past addr and -1 otherwise. addr is adjusted to be at
+ * the end of the node.
*/
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
u64 size, u64 max_addr)
{
int ret = 0;
+
nodes[nid].start = *addr;
*addr += size;
if (*addr >= max_addr) {
@@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
for (i = node_start; i < num_nodes + node_start; i++) {
u64 end = *addr + size;
+
if (i < big)
end += FAKE_NODE_MIN_SIZE;
/*
@@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
struct bootnode nodes[MAX_NUMNODES];
- u64 addr = start_pfn << PAGE_SHIFT;
+ u64 size, addr = start_pfn << PAGE_SHIFT;
u64 max_addr = end_pfn << PAGE_SHIFT;
- int num_nodes = 0;
- int coeff_flag;
- int coeff = -1;
- int num = 0;
- u64 size;
- int i;
+ int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
memset(&nodes, 0, sizeof(nodes));
/*
@@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
* system RAM into N fake nodes.
*/
if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
- num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
- simple_strtol(cmdline, NULL, 0));
+ long n = simple_strtol(cmdline, NULL, 0);
+
+ num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
if (num_nodes < 0)
return num_nodes;
goto out;
@@ -483,46 +471,47 @@ out:
for_each_node_mask(i, node_possible_map) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
acpi_fake_nodes(nodes, num_nodes);
- numa_init_array();
- return 0;
+ numa_init_array();
+ return 0;
}
#endif /* CONFIG_NUMA_EMU */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{
+{
int i;
nodes_clear(node_possible_map);
#ifdef CONFIG_NUMA_EMU
if (cmdline && !numa_emulation(start_pfn, end_pfn))
- return;
+ return;
nodes_clear(node_possible_map);
#endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
end_pfn << PAGE_SHIFT))
- return;
+ return;
nodes_clear(node_possible_map);
#endif
#ifdef CONFIG_K8_NUMA
- if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+ if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
+ end_pfn<<PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
#endif
printk(KERN_INFO "%s\n",
numa_off ? "NUMA turned off" : "No NUMA configuration found");
- printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
+ printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
start_pfn << PAGE_SHIFT,
- end_pfn << PAGE_SHIFT);
- /* setup dummy node covering all memory */
- memnode_shift = 63;
+ end_pfn << PAGE_SHIFT);
+ /* setup dummy node covering all memory */
+ memnode_shift = 63;
memnodemap = memnode.embedded_map;
memnodemap[0] = 0;
nodes_clear(node_online_map);
@@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
node_set(0, node_possible_map);
for (i = 0; i < NR_CPUS; i++)
numa_set_node(i, 0);
- node_to_cpumask[0] = cpumask_of_cpu(0);
+ /* cpumask_of_cpu() may not be available during early startup */
+ memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
+ cpu_set(0, node_to_cpumask_map[0]);
e820_register_active_regions(0, start_pfn, end_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
__cpuinit void numa_add_cpu(int cpu)
{
- set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-}
+ set_bit(cpu,
+ (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
void __cpuinit numa_set_node(int cpu, int node)
{
+ int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
+
cpu_pda(cpu)->nodenumber = node;
- cpu_to_node(cpu) = node;
+
+ if(cpu_to_node_map)
+ cpu_to_node_map[cpu] = node;
+ else if(per_cpu_offset(cpu))
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+ else
+ Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
}
-unsigned long __init numa_free_all_bootmem(void)
-{
- int i;
+unsigned long __init numa_free_all_bootmem(void)
+{
unsigned long pages = 0;
- for_each_online_node(i) {
+ int i;
+
+ for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
- }
+
return pages;
-}
+}
void __init paging_init(void)
-{
- int i;
+{
unsigned long max_zone_pfns[MAX_NR_ZONES];
+
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -568,32 +569,27 @@ void __init paging_init(void)
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
- for_each_online_node(i) {
- setup_node_zones(i);
- }
-
free_area_init_nodes(max_zone_pfns);
-}
+}
static __init int numa_setup(char *opt)
-{
+{
if (!opt)
return -EINVAL;
- if (!strncmp(opt,"off",3))
+ if (!strncmp(opt, "off", 3))
numa_off = 1;
#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
- if (!strncmp(opt,"noacpi",6))
- acpi_numa = -1;
- if (!strncmp(opt,"hotadd=", 7))
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+ if (!strncmp(opt, "hotadd=", 7))
hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
return 0;
-}
-
+}
early_param("numa", numa_setup);
/*
@@ -611,38 +607,16 @@ early_param("numa", numa_setup);
void __init init_cpu_to_node(void)
{
int i;
- for (i = 0; i < NR_CPUS; i++) {
- u8 apicid = x86_cpu_to_apicid_init[i];
+
+ for (i = 0; i < NR_CPUS; i++) {
+ u16 apicid = x86_cpu_to_apicid_init[i];
+
if (apicid == BAD_APICID)
continue;
if (apicid_to_node[apicid] == NUMA_NO_NODE)
continue;
- numa_set_node(i,apicid_to_node[apicid]);
+ numa_set_node(i, apicid_to_node[apicid]);
}
}
-EXPORT_SYMBOL(cpu_to_node);
-EXPORT_SYMBOL(node_to_cpumask);
-EXPORT_SYMBOL(memnode);
-EXPORT_SYMBOL(node_data);
-
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * Functions to convert PFNs from/to per node page addresses.
- * These are out of line because they are quite big.
- * They could be all tuned by pre caching more state.
- * Should do that.
- */
-int pfn_valid(unsigned long pfn)
-{
- unsigned nid;
- if (pfn >= num_physpages)
- return 0;
- nid = pfn_to_nid(pfn);
- if (nid == 0xff)
- return 0;
- return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
-}
-EXPORT_SYMBOL(pfn_valid);
-#endif
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 00000000000..06353d43f72
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,224 @@
+/*
+ * self test for change_page_attr.
+ *
+ * Clears the global bit on random pages in the direct mapping, then reverts
+ * and compares page tables forwards and afterwards.
+ */
+#include <linux/bootmem.h>
+#include <linux/random.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+enum {
+ NTEST = 4000,
+#ifdef CONFIG_X86_64
+ LPS = (1 << PMD_SHIFT),
+#elif defined(CONFIG_X86_PAE)
+ LPS = (1 << PMD_SHIFT),
+#else
+ LPS = (1 << 22),
+#endif
+ GPS = (1<<30)
+};
+
+struct split_state {
+ long lpg, gpg, spg, exec;
+ long min_exec, max_exec;
+};
+
+static __init int print_split(struct split_state *s)
+{
+ long i, expected, missed = 0;
+ int printed = 0;
+ int err = 0;
+
+ s->lpg = s->gpg = s->spg = s->exec = 0;
+ s->min_exec = ~0UL;
+ s->max_exec = 0;
+ for (i = 0; i < max_pfn_mapped; ) {
+ unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
+ int level;
+ pte_t *pte;
+
+ pte = lookup_address(addr, &level);
+ if (!pte) {
+ if (!printed) {
+ dump_pagetable(addr);
+ printk(KERN_INFO "CPA %lx no pte level %d\n",
+ addr, level);
+ printed = 1;
+ }
+ missed++;
+ i++;
+ continue;
+ }
+
+ if (level == PG_LEVEL_1G && sizeof(long) == 8) {
+ s->gpg++;
+ i += GPS/PAGE_SIZE;
+ } else if (level == PG_LEVEL_2M) {
+ if (!(pte_val(*pte) & _PAGE_PSE)) {
+ printk(KERN_ERR
+ "%lx level %d but not PSE %Lx\n",
+ addr, level, (u64)pte_val(*pte));
+ err = 1;
+ }
+ s->lpg++;
+ i += LPS/PAGE_SIZE;
+ } else {
+ s->spg++;
+ i++;
+ }
+ if (!(pte_val(*pte) & _PAGE_NX)) {
+ s->exec++;
+ if (addr < s->min_exec)
+ s->min_exec = addr;
+ if (addr > s->max_exec)
+ s->max_exec = addr;
+ }
+ }
+ printk(KERN_INFO
+ "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+ s->spg, s->lpg, s->gpg, s->exec,
+ s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed);
+
+ expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
+ if (expected != i) {
+ printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
+ max_pfn_mapped, expected);
+ return 1;
+ }
+ return err;
+}
+
+static unsigned long __initdata addr[NTEST];
+static unsigned int __initdata len[NTEST];
+
+/* Change the global bit on random pages in the direct mapping */
+static __init int exercise_pageattr(void)
+{
+ struct split_state sa, sb, sc;
+ unsigned long *bm;
+ pte_t *pte, pte0;
+ int failed = 0;
+ int level;
+ int i, k;
+ int err;
+
+ printk(KERN_INFO "CPA exercising pageattr\n");
+
+ bm = vmalloc((max_pfn_mapped + 7) / 8);
+ if (!bm) {
+ printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
+ return -ENOMEM;
+ }
+ memset(bm, 0, (max_pfn_mapped + 7) / 8);
+
+ failed += print_split(&sa);
+ srandom32(100);
+
+ for (i = 0; i < NTEST; i++) {
+ unsigned long pfn = random32() % max_pfn_mapped;
+
+ addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
+ len[i] = random32() % 100;
+ len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
+
+ if (len[i] == 0)
+ len[i] = 1;
+
+ pte = NULL;
+ pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
+
+ for (k = 0; k < len[i]; k++) {
+ pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
+ if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
+ addr[i] = 0;
+ break;
+ }
+ if (k == 0) {
+ pte0 = *pte;
+ } else {
+ if (pgprot_val(pte_pgprot(*pte)) !=
+ pgprot_val(pte_pgprot(pte0))) {
+ len[i] = k;
+ break;
+ }
+ }
+ if (test_bit(pfn + k, bm)) {
+ len[i] = k;
+ break;
+ }
+ __set_bit(pfn + k, bm);
+ }
+ if (!addr[i] || !pte || !k) {
+ addr[i] = 0;
+ continue;
+ }
+
+ err = change_page_attr_clear(addr[i], len[i],
+ __pgprot(_PAGE_GLOBAL));
+ if (err < 0) {
+ printk(KERN_ERR "CPA %d failed %d\n", i, err);
+ failed++;
+ }
+
+ pte = lookup_address(addr[i], &level);
+ if (!pte || pte_global(*pte) || pte_huge(*pte)) {
+ printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
+ pte ? (u64)pte_val(*pte) : 0ULL);
+ failed++;
+ }
+ if (level != PG_LEVEL_4K) {
+ printk(KERN_ERR "CPA %lx: unexpected level %d\n",
+ addr[i], level);
+ failed++;
+ }
+
+ }
+ vfree(bm);
+
+ failed += print_split(&sb);
+
+ printk(KERN_INFO "CPA reverting everything\n");
+ for (i = 0; i < NTEST; i++) {
+ if (!addr[i])
+ continue;
+ pte = lookup_address(addr[i], &level);
+ if (!pte) {
+ printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
+ failed++;
+ continue;
+ }
+ err = change_page_attr_set(addr[i], len[i],
+ __pgprot(_PAGE_GLOBAL));
+ if (err < 0) {
+ printk(KERN_ERR "CPA reverting failed: %d\n", err);
+ failed++;
+ }
+ pte = lookup_address(addr[i], &level);
+ if (!pte || !pte_global(*pte)) {
+ printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
+ addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
+ failed++;
+ }
+
+ }
+
+ failed += print_split(&sc);
+
+ if (failed) {
+ printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n");
+ WARN_ON(1);
+ } else {
+ printk(KERN_INFO "CPA selftests PASSED\n");
+ }
+
+ return 0;
+}
+module_init(exercise_pageattr);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 00000000000..1cc6607eacb
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,564 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @addr: virtual start address
+ * @size: number of bytes to flush
+ *
+ * clflush is an unordered instruction which needs fencing with mfence
+ * to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+ void *vend = vaddr + size - 1;
+
+ mb();
+
+ for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+ clflush(vaddr);
+ /*
+ * Flush any possible final partial cacheline:
+ */
+ clflush(vend);
+
+ mb();
+}
+
+static void __cpa_flush_all(void *arg)
+{
+ /*
+ * Flush all to work around Errata in early athlons regarding
+ * large page flushing.
+ */
+ __flush_tlb_all();
+
+ if (boot_cpu_data.x86_model >= 4)
+ wbinvd();
+}
+
+static void cpa_flush_all(void)
+{
+ BUG_ON(irqs_disabled());
+
+ on_each_cpu(__cpa_flush_all, NULL, 1, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+ /*
+ * We could optimize that further and do individual per page
+ * tlb invalidates for a low number of pages. Caveat: we must
+ * flush the high aliases on 64bit as well.
+ */
+ __flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages)
+{
+ unsigned int i, level;
+ unsigned long addr;
+
+ BUG_ON(irqs_disabled());
+ WARN_ON(PAGE_ALIGN(start) != start);
+
+ on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+
+ /*
+ * We only need to flush on one CPU,
+ * clflush is a MESI-coherent instruction that
+ * will cause all other CPUs to flush the same
+ * cachelines:
+ */
+ for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+ pte_t *pte = lookup_address(addr, &level);
+
+ /*
+ * Only flush present addresses:
+ */
+ if (pte && pte_present(*pte))
+ clflush_cache_range((void *) addr, PAGE_SIZE);
+ }
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
+{
+ pgprot_t forbidden = __pgprot(0);
+
+ /*
+ * The BIOS area between 640k and 1Mb needs to be executable for
+ * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+ */
+ if (within(__pa(address), BIOS_BEGIN, BIOS_END))
+ pgprot_val(forbidden) |= _PAGE_NX;
+
+ /*
+ * The kernel text needs to be executable for obvious reasons
+ * Does not cover __inittext since that is gone later on
+ */
+ if (within(address, (unsigned long)_text, (unsigned long)_etext))
+ pgprot_val(forbidden) |= _PAGE_NX;
+
+#ifdef CONFIG_DEBUG_RODATA
+ /* The .rodata section needs to be read-only */
+ if (within(address, (unsigned long)__start_rodata,
+ (unsigned long)__end_rodata))
+ pgprot_val(forbidden) |= _PAGE_RW;
+#endif
+
+ prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+ return prot;
+}
+
+pte_t *lookup_address(unsigned long address, int *level)
+{
+ pgd_t *pgd = pgd_offset_k(address);
+ pud_t *pud;
+ pmd_t *pmd;
+
+ *level = PG_LEVEL_NONE;
+
+ if (pgd_none(*pgd))
+ return NULL;
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud))
+ return NULL;
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ return NULL;
+
+ *level = PG_LEVEL_2M;
+ if (pmd_large(*pmd))
+ return (pte_t *)pmd;
+
+ *level = PG_LEVEL_4K;
+ return pte_offset_kernel(pmd, address);
+}
+
+static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+ /* change init_mm */
+ set_pte_atomic(kpte, pte);
+#ifdef CONFIG_X86_32
+ if (!SHARED_KERNEL_PMD) {
+ struct page *page;
+
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ pud = pud_offset(pgd, address);
+ pmd = pmd_offset(pud, address);
+ set_pte_atomic((pte_t *)pmd, pte);
+ }
+ }
+#endif
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+ pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+ gfp_t gfp_flags = GFP_KERNEL;
+ unsigned long flags;
+ unsigned long addr;
+ pte_t *pbase, *tmp;
+ struct page *base;
+ unsigned int i, level;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
+ gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
+#endif
+ base = alloc_pages(gfp_flags, 0);
+ if (!base)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ /*
+ * Check for races, another CPU might have split this page
+ * up for us already:
+ */
+ tmp = lookup_address(address, &level);
+ if (tmp != kpte) {
+ WARN_ON_ONCE(1);
+ goto out_unlock;
+ }
+
+ address = __pa(address);
+ addr = address & LARGE_PAGE_MASK;
+ pbase = (pte_t *)page_address(base);
+#ifdef CONFIG_X86_32
+ paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+#endif
+
+ pgprot_val(ref_prot) &= ~_PAGE_NX;
+ for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
+ set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
+
+ /*
+ * Install the new, split up pagetable. Important detail here:
+ *
+ * On Intel the NX bit of all levels must be cleared to make a
+ * page executable. See section 4.13.2 of Intel 64 and IA-32
+ * Architectures Software Developer's Manual).
+ */
+ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+ __set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
+ base = NULL;
+
+out_unlock:
+ spin_unlock_irqrestore(&pgd_lock, flags);
+
+ if (base)
+ __free_pages(base, 0);
+
+ return 0;
+}
+
+static int
+__change_page_attr(unsigned long address, unsigned long pfn,
+ pgprot_t mask_set, pgprot_t mask_clr)
+{
+ struct page *kpte_page;
+ int level, err = 0;
+ pte_t *kpte;
+
+#ifdef CONFIG_X86_32
+ BUG_ON(pfn > max_low_pfn);
+#endif
+
+repeat:
+ kpte = lookup_address(address, &level);
+ if (!kpte)
+ return -EINVAL;
+
+ kpte_page = virt_to_page(kpte);
+ BUG_ON(PageLRU(kpte_page));
+ BUG_ON(PageCompound(kpte_page));
+
+ if (level == PG_LEVEL_4K) {
+ pgprot_t new_prot = pte_pgprot(*kpte);
+ pte_t new_pte, old_pte = *kpte;
+
+ pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
+ pgprot_val(new_prot) |= pgprot_val(mask_set);
+
+ new_prot = static_protections(new_prot, address);
+
+ new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+ BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
+
+ set_pte_atomic(kpte, new_pte);
+ } else {
+ err = split_large_page(kpte, address);
+ if (!err)
+ goto repeat;
+ }
+ return err;
+}
+
+/**
+ * change_page_attr_addr - Change page table attributes in linear mapping
+ * @address: Virtual address in linear mapping.
+ * @prot: New page table attribute (PAGE_*)
+ *
+ * Change page attributes of a page in the direct mapping. This is a variant
+ * of change_page_attr() that also works on memory holes that do not have
+ * mem_map entry (pfn_valid() is false).
+ *
+ * See change_page_attr() documentation for more details.
+ *
+ * Modules and drivers should use the set_memory_* APIs instead.
+ */
+
+#define HIGH_MAP_START __START_KERNEL_map
+#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE)
+
+static int
+change_page_attr_addr(unsigned long address, pgprot_t mask_set,
+ pgprot_t mask_clr)
+{
+ unsigned long phys_addr = __pa(address);
+ unsigned long pfn = phys_addr >> PAGE_SHIFT;
+ int err;
+
+#ifdef CONFIG_X86_64
+ /*
+ * If we are inside the high mapped kernel range, then we
+ * fixup the low mapping first. __va() returns the virtual
+ * address in the linear mapping:
+ */
+ if (within(address, HIGH_MAP_START, HIGH_MAP_END))
+ address = (unsigned long) __va(phys_addr);
+#endif
+
+ err = __change_page_attr(address, pfn, mask_set, mask_clr);
+ if (err)
+ return err;
+
+#ifdef CONFIG_X86_64
+ /*
+ * If the physical address is inside the kernel map, we need
+ * to touch the high mapped kernel as well:
+ */
+ if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
+ /*
+ * Calc the high mapping address. See __phys_addr()
+ * for the non obvious details.
+ */
+ address = phys_addr + HIGH_MAP_START - phys_base;
+ /* Make sure the kernel mappings stay executable */
+ pgprot_val(mask_clr) |= _PAGE_NX;
+
+ /*
+ * Our high aliases are imprecise, because we check
+ * everything between 0 and KERNEL_TEXT_SIZE, so do
+ * not propagate lookup failures back to users:
+ */
+ __change_page_attr(address, pfn, mask_set, mask_clr);
+ }
+#endif
+ return err;
+}
+
+static int __change_page_attr_set_clr(unsigned long addr, int numpages,
+ pgprot_t mask_set, pgprot_t mask_clr)
+{
+ unsigned int i;
+ int ret;
+
+ for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
+ ret = change_page_attr_addr(addr, mask_set, mask_clr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int change_page_attr_set_clr(unsigned long addr, int numpages,
+ pgprot_t mask_set, pgprot_t mask_clr)
+{
+ int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
+ mask_clr);
+
+ /*
+ * On success we use clflush, when the CPU supports it to
+ * avoid the wbindv. If the CPU does not support it and in the
+ * error case we fall back to cpa_flush_all (which uses
+ * wbindv):
+ */
+ if (!ret && cpu_has_clflush)
+ cpa_flush_range(addr, numpages);
+ else
+ cpa_flush_all();
+
+ return ret;
+}
+
+static inline int change_page_attr_set(unsigned long addr, int numpages,
+ pgprot_t mask)
+{
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int change_page_attr_clear(unsigned long addr, int numpages,
+ pgprot_t mask)
+{
+ return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages,
+ __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+ return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return set_memory_rw(addr, numpages);
+}
+
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
+static inline int __change_page_attr_set(unsigned long addr, int numpages,
+ pgprot_t mask)
+{
+ return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int __change_page_attr_clear(unsigned long addr, int numpages,
+ pgprot_t mask)
+{
+ return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return __change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_PRESENT | _PAGE_RW));
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+
+ return __change_page_attr_clear(addr, numpages,
+ __pgprot(_PAGE_PRESENT));
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ if (PageHighMem(page))
+ return;
+ if (!enable) {
+ debug_check_no_locks_freed(page_address(page),
+ numpages * PAGE_SIZE);
+ }
+
+ /*
+ * If page allocator is not up yet then do not call c_p_a():
+ */
+ if (!debug_pagealloc_enabled)
+ return;
+
+ /*
+ * The return value is ignored - the calls cannot fail,
+ * large pages are disabled at boot time:
+ */
+ if (enable)
+ __set_pages_p(page, numpages);
+ else
+ __set_pages_np(page, numpages);
+
+ /*
+ * We should perform an IPI and flush all tlbs,
+ * but that can deadlock->flush only current cpu:
+ */
+ __flush_tlb_all();
+}
+#endif
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
deleted file mode 100644
index 260073c0760..00000000000
--- a/arch/x86/mm/pageattr_32.c
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Thanks to Ben LaHaise for precious feedback.
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/sections.h>
-
-static DEFINE_SPINLOCK(cpa_lock);
-static struct list_head df_list = LIST_HEAD_INIT(df_list);
-
-
-pte_t *lookup_address(unsigned long address)
-{
- pgd_t *pgd = pgd_offset_k(address);
- pud_t *pud;
- pmd_t *pmd;
- if (pgd_none(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (pud_none(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return NULL;
- if (pmd_large(*pmd))
- return (pte_t *)pmd;
- return pte_offset_kernel(pmd, address);
-}
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
- pgprot_t ref_prot)
-{
- int i;
- unsigned long addr;
- struct page *base;
- pte_t *pbase;
-
- spin_unlock_irq(&cpa_lock);
- base = alloc_pages(GFP_KERNEL, 0);
- spin_lock_irq(&cpa_lock);
- if (!base)
- return NULL;
-
- /*
- * page_private is used to track the number of entries in
- * the page table page that have non standard attributes.
- */
- SetPagePrivate(base);
- page_private(base) = 0;
-
- address = __pa(address);
- addr = address & LARGE_PAGE_MASK;
- pbase = (pte_t *)page_address(base);
- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
- set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
- addr == address ? prot : ref_prot));
- }
- return base;
-}
-
-static void cache_flush_page(struct page *p)
-{
- void *adr = page_address(p);
- int i;
- for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
- clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
- struct list_head *lh = (struct list_head *)arg;
- struct page *p;
-
- /* High level code is not ready for clflush yet */
- if (0 && cpu_has_clflush) {
- list_for_each_entry (p, lh, lru)
- cache_flush_page(p);
- } else if (boot_cpu_data.x86_model >= 4)
- wbinvd();
-
- /* Flush all to work around Errata in early athlons regarding
- * large page flushing.
- */
- __flush_tlb_all();
-}
-
-static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
-{
- struct page *page;
- unsigned long flags;
-
- set_pte_atomic(kpte, pte); /* change init_mm */
- if (SHARED_KERNEL_PMD)
- return;
-
- spin_lock_irqsave(&pgd_lock, flags);
- for (page = pgd_list; page; page = (struct page *)page->index) {
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pud = pud_offset(pgd, address);
- pmd = pmd_offset(pud, address);
- set_pte_atomic((pte_t *)pmd, pte);
- }
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-/*
- * No more special protections in this 2/4MB area - revert to a
- * large page again.
- */
-static inline void revert_page(struct page *kpte_page, unsigned long address)
-{
- pgprot_t ref_prot;
- pte_t *linear;
-
- ref_prot =
- ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
- ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
- linear = (pte_t *)
- pmd_offset(pud_offset(pgd_offset_k(address), address), address);
- set_pmd_pte(linear, address,
- pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
- ref_prot));
-}
-
-static inline void save_page(struct page *kpte_page)
-{
- if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
- list_add(&kpte_page->lru, &df_list);
-}
-
-static int
-__change_page_attr(struct page *page, pgprot_t prot)
-{
- pte_t *kpte;
- unsigned long address;
- struct page *kpte_page;
-
- BUG_ON(PageHighMem(page));
- address = (unsigned long)page_address(page);
-
- kpte = lookup_address(address);
- if (!kpte)
- return -EINVAL;
- kpte_page = virt_to_page(kpte);
- BUG_ON(PageLRU(kpte_page));
- BUG_ON(PageCompound(kpte_page));
-
- if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
- if (!pte_huge(*kpte)) {
- set_pte_atomic(kpte, mk_pte(page, prot));
- } else {
- pgprot_t ref_prot;
- struct page *split;
-
- ref_prot =
- ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
- ? PAGE_KERNEL_EXEC : PAGE_KERNEL;
- split = split_large_page(address, prot, ref_prot);
- if (!split)
- return -ENOMEM;
- set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
- kpte_page = split;
- }
- page_private(kpte_page)++;
- } else if (!pte_huge(*kpte)) {
- set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
- BUG_ON(page_private(kpte_page) == 0);
- page_private(kpte_page)--;
- } else
- BUG();
-
- /*
- * If the pte was reserved, it means it was created at boot
- * time (not via split_large_page) and in turn we must not
- * replace it with a largepage.
- */
-
- save_page(kpte_page);
- if (!PageReserved(kpte_page)) {
- if (cpu_has_pse && (page_private(kpte_page) == 0)) {
- paravirt_release_pt(page_to_pfn(kpte_page));
- revert_page(kpte_page, address);
- }
- }
- return 0;
-}
-
-static inline void flush_map(struct list_head *l)
-{
- on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- *
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- *
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
- int err = 0;
- int i;
- unsigned long flags;
-
- spin_lock_irqsave(&cpa_lock, flags);
- for (i = 0; i < numpages; i++, page++) {
- err = __change_page_attr(page, prot);
- if (err)
- break;
- }
- spin_unlock_irqrestore(&cpa_lock, flags);
- return err;
-}
-
-void global_flush_tlb(void)
-{
- struct list_head l;
- struct page *pg, *next;
-
- BUG_ON(irqs_disabled());
-
- spin_lock_irq(&cpa_lock);
- list_replace_init(&df_list, &l);
- spin_unlock_irq(&cpa_lock);
- flush_map(&l);
- list_for_each_entry_safe(pg, next, &l, lru) {
- list_del(&pg->lru);
- clear_bit(PG_arch_1, &pg->flags);
- if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
- continue;
- ClearPagePrivate(pg);
- __free_page(pg);
- }
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
- if (PageHighMem(page))
- return;
- if (!enable)
- debug_check_no_locks_freed(page_address(page),
- numpages * PAGE_SIZE);
-
- /* the return value is ignored - the calls cannot fail,
- * large pages are disabled at boot time.
- */
- change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
- /* we should perform an IPI and flush all tlbs,
- * but that can deadlock->flush only current cpu.
- */
- __flush_tlb_all();
-}
-#endif
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
deleted file mode 100644
index c40afbaaf93..00000000000
--- a/arch/x86/mm/pageattr_64.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Thanks to Ben LaHaise for precious feedback.
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-pte_t *lookup_address(unsigned long address)
-{
- pgd_t *pgd = pgd_offset_k(address);
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- if (pgd_none(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return NULL;
- if (pmd_large(*pmd))
- return (pte_t *)pmd;
- pte = pte_offset_kernel(pmd, address);
- if (pte && !pte_present(*pte))
- pte = NULL;
- return pte;
-}
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
- pgprot_t ref_prot)
-{
- int i;
- unsigned long addr;
- struct page *base = alloc_pages(GFP_KERNEL, 0);
- pte_t *pbase;
- if (!base)
- return NULL;
- /*
- * page_private is used to track the number of entries in
- * the page table page have non standard attributes.
- */
- SetPagePrivate(base);
- page_private(base) = 0;
-
- address = __pa(address);
- addr = address & LARGE_PAGE_MASK;
- pbase = (pte_t *)page_address(base);
- for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
- pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
- addr == address ? prot : ref_prot);
- }
- return base;
-}
-
-void clflush_cache_range(void *adr, int size)
-{
- int i;
- for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
- clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
- struct list_head *l = (struct list_head *)arg;
- struct page *pg;
-
- /* When clflush is available always use it because it is
- much cheaper than WBINVD. */
- /* clflush is still broken. Disable for now. */
- if (1 || !cpu_has_clflush)
- asm volatile("wbinvd" ::: "memory");
- else list_for_each_entry(pg, l, lru) {
- void *adr = page_address(pg);
- clflush_cache_range(adr, PAGE_SIZE);
- }
- __flush_tlb_all();
-}
-
-static inline void flush_map(struct list_head *l)
-{
- on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
-
-static inline void save_page(struct page *fpage)
-{
- if (!test_and_set_bit(PG_arch_1, &fpage->flags))
- list_add(&fpage->lru, &deferred_pages);
-}
-
-/*
- * No more special protections in this 2/4MB area - revert to a
- * large page again.
- */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t large_pte;
- unsigned long pfn;
-
- pgd = pgd_offset_k(address);
- BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd,address);
- BUG_ON(pud_none(*pud));
- pmd = pmd_offset(pud, address);
- BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
- pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
- large_pte = pfn_pte(pfn, ref_prot);
- large_pte = pte_mkhuge(large_pte);
- set_pte((pte_t *)pmd, large_pte);
-}
-
-static int
-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
- pgprot_t ref_prot)
-{
- pte_t *kpte;
- struct page *kpte_page;
- pgprot_t ref_prot2;
-
- kpte = lookup_address(address);
- if (!kpte) return 0;
- kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
- BUG_ON(PageLRU(kpte_page));
- BUG_ON(PageCompound(kpte_page));
- if (pgprot_val(prot) != pgprot_val(ref_prot)) {
- if (!pte_huge(*kpte)) {
- set_pte(kpte, pfn_pte(pfn, prot));
- } else {
- /*
- * split_large_page will take the reference for this
- * change_page_attr on the split page.
- */
- struct page *split;
- ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
- split = split_large_page(address, prot, ref_prot2);
- if (!split)
- return -ENOMEM;
- pgprot_val(ref_prot2) &= ~_PAGE_NX;
- set_pte(kpte, mk_pte(split, ref_prot2));
- kpte_page = split;
- }
- page_private(kpte_page)++;
- } else if (!pte_huge(*kpte)) {
- set_pte(kpte, pfn_pte(pfn, ref_prot));
- BUG_ON(page_private(kpte_page) == 0);
- page_private(kpte_page)--;
- } else
- BUG();
-
- /* on x86-64 the direct mapping set at boot is not using 4k pages */
- BUG_ON(PageReserved(kpte_page));
-
- save_page(kpte_page);
- if (page_private(kpte_page) == 0)
- revert_page(address, ref_prot);
- return 0;
-}
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- *
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- *
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
-{
- int err = 0, kernel_map = 0;
- int i;
-
- if (address >= __START_KERNEL_map
- && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
- address = (unsigned long)__va(__pa(address));
- kernel_map = 1;
- }
-
- down_write(&init_mm.mmap_sem);
- for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
- unsigned long pfn = __pa(address) >> PAGE_SHIFT;
-
- if (!kernel_map || pte_present(pfn_pte(0, prot))) {
- err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
- if (err)
- break;
- }
- /* Handle kernel mapping too which aliases part of the
- * lowmem */
- if (__pa(address) < KERNEL_TEXT_SIZE) {
- unsigned long addr2;
- pgprot_t prot2;
- addr2 = __START_KERNEL_map + __pa(address);
- /* Make sure the kernel mappings stay executable */
- prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
- err = __change_page_attr(addr2, pfn, prot2,
- PAGE_KERNEL_EXEC);
- }
- }
- up_write(&init_mm.mmap_sem);
- return err;
-}
-
-/* Don't call this for MMIO areas that may not have a mem_map entry */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
- unsigned long addr = (unsigned long)page_address(page);
- return change_page_attr_addr(addr, numpages, prot);
-}
-
-void global_flush_tlb(void)
-{
- struct page *pg, *next;
- struct list_head l;
-
- /*
- * Write-protect the semaphore, to exclude two contexts
- * doing a list_replace_init() call in parallel and to
- * exclude new additions to the deferred_pages list:
- */
- down_write(&init_mm.mmap_sem);
- list_replace_init(&deferred_pages, &l);
- up_write(&init_mm.mmap_sem);
-
- flush_map(&l);
-
- list_for_each_entry_safe(pg, next, &l, lru) {
- list_del(&pg->lru);
- clear_bit(PG_arch_1, &pg->flags);
- if (page_private(pg) != 0)
- continue;
- ClearPagePrivate(pg);
- __free_page(pg);
- }
-}
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index be61a1d845a..2ae5999a795 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
return pte;
}
-void pmd_ctor(struct kmem_cache *cache, void *pmd)
-{
- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
/*
* List of all pgd's needed for non-PAE so it can invalidate entries
* in both cached and uncached pgd's; not needed for PAE since the
@@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd)
* vmalloc faults work because attached pagetables are never freed.
* -- wli
*/
-DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
-
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
- page->index = (unsigned long)pgd_list;
- if (pgd_list)
- set_page_private(pgd_list, (unsigned long)&page->index);
- pgd_list = page;
- set_page_private(page, (unsigned long)&pgd_list);
+
+ list_add(&page->lru, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
- struct page *next, **pprev, *page = virt_to_page(pgd);
- next = (struct page *)page->index;
- pprev = (struct page **)page_private(page);
- *pprev = next;
- if (next)
- set_page_private(next, (unsigned long)pprev);
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
}
@@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd)
if (SHARED_KERNEL_PMD)
return;
- paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd)
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-/* If we allocate a pmd for part of the kernel address space, then
- make sure its initialized with the appropriate kernel mappings.
- Otherwise use a cached zeroed pmd. */
-static pmd_t *pmd_cache_alloc(int idx)
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(pgd_t *pgdp)
{
- pmd_t *pmd;
+ int i;
- if (idx >= USER_PTRS_PER_PGD) {
- pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ pgd_t pgd = pgdp[i];
- if (pmd)
- memcpy(pmd,
- (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = native_make_pgd(0);
+
+ paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(pmd);
+ }
+ }
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ pud_t *pud;
+ unsigned long addr;
+ int i;
+
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmd_alloc_one(mm, addr);
+
+ if (!pmd) {
+ pgd_mop_up_pmds(pgd);
+ return 0;
+ }
+
+ if (i >= USER_PTRS_PER_PGD)
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
sizeof(pmd_t) * PTRS_PER_PMD);
- } else
- pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
- return pmd;
+ pud_populate(mm, pud, pmd);
+ }
+
+ return 1;
+}
+#else /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ return 1;
}
-static void pmd_cache_free(pmd_t *pmd, int idx)
+static void pgd_mop_up_pmds(pgd_t *pgd)
{
- if (idx >= USER_PTRS_PER_PGD)
- free_page((unsigned long)pmd);
- else
- kmem_cache_free(pmd_cache, pmd);
}
+#endif /* CONFIG_X86_PAE */
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- int i;
pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
- if (PTRS_PER_PMD == 1 || !pgd)
- return pgd;
+ mm->pgd = pgd; /* so that alloc_pd can use it */
- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
- pmd_t *pmd = pmd_cache_alloc(i);
-
- if (!pmd)
- goto out_oom;
-
- paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
- set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+ quicklist_free(0, pgd_dtor, pgd);
+ pgd = NULL;
}
- return pgd;
-out_oom:
- for (i--; i >= 0; i--) {
- pgd_t pgdent = pgd[i];
- void* pmd = (void *)__va(pgd_val(pgdent)-1);
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- pmd_cache_free(pmd, i);
- }
- quicklist_free(0, pgd_dtor, pgd);
- return NULL;
+ return pgd;
}
void pgd_free(pgd_t *pgd)
{
- int i;
-
- /* in the PAE case user pgd entries are overwritten before usage */
- if (PTRS_PER_PMD > 1)
- for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
- pgd_t pgdent = pgd[i];
- void* pmd = (void *)__va(pgd_val(pgdent)-1);
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- pmd_cache_free(pmd, i);
- }
- /* in the non-PAE case, free_pgtables() clears user pgd entries */
+ pgd_mop_up_pmds(pgd);
quicklist_free(0, pgd_dtor, pgd);
}
@@ -372,4 +376,3 @@ void check_pgt_cache(void)
{
quicklist_trim(0, pgd_dtor, 25, 16);
}
-
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index ea85172fc0c..65416f843e5 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -130,6 +130,9 @@ void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
int pxm, node;
+ int apic_id;
+
+ apic_id = pa->apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
bad_srat();
return;
}
- apicid_to_node[pa->apic_id] = node;
+ apicid_to_node[apic_id] = node;
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
- pxm, pa->apic_id, node);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Protect against too large hotadd areas that would fill up memory.
- */
-static int hotadd_enough_memory(struct bootnode *nd)
-{
- static unsigned long allocated;
- static unsigned long last_area_end;
- unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
- long mem = pages * sizeof(struct page);
- unsigned long addr;
- unsigned long allowed;
- unsigned long oldpages = pages;
-
- if (mem < 0)
- return 0;
- allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
- allowed = (allowed / 100) * hotadd_percent;
- if (allocated + mem > allowed) {
- unsigned long range;
- /* Give them at least part of their hotadd memory upto hotadd_percent
- It would be better to spread the limit out
- over multiple hotplug areas, but that is too complicated
- right now */
- if (allocated >= allowed)
- return 0;
- range = allowed - allocated;
- pages = (range / PAGE_SIZE);
- mem = pages * sizeof(struct page);
- nd->end = nd->start + range;
- }
- /* Not completely fool proof, but a good sanity check */
- addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
- if (addr == -1UL)
- return 0;
- if (pages != oldpages)
- printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
- pages << PAGE_SHIFT);
- last_area_end = addr + mem;
- allocated += mem;
- return 1;
-}
-
-static int update_end_of_memory(unsigned long end)
-{
- found_add_area = 1;
- if ((end >> PAGE_SHIFT) > end_pfn)
- end_pfn = end >> PAGE_SHIFT;
- return 1;
+ pxm, apic_id, node);
}
-static inline int save_add_info(void)
-{
- return hotadd_percent > 0;
-}
-#else
int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
-#endif
/*
* Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add infomation.
+ * Both SPARSE and RESERVE need nodes_add information.
* This code supports one contiguous hot add area per node.
*/
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
@@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
return 1;
}
-static void unparse_node(int node)
+static void __init unparse_node(int node)
{
int i;
node_clear(node, nodes_parsed);
@@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
cutoff_node(i, start, end);
- if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
+ /*
+ * don't confuse VM with a node that doesn't have the
+ * minimum memory.
+ */
+ if (nodes[i].end &&
+ (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
unparse_node(i);
node_set_offline(i);
}
@@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
for (i = 0; i < NR_CPUS; i++) {
- if (cpu_to_node(i) == NUMA_NO_NODE)
+ int node = early_cpu_to_node(i);
+
+ if (node == NUMA_NO_NODE)
continue;
- if (!node_isset(cpu_to_node(i), node_possible_map))
+ if (!node_isset(node, node_possible_map))
numa_set_node(i, NUMA_NO_NODE);
}
numa_init_array();
@@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
}
#ifdef CONFIG_NUMA_EMU
+static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
+ [0 ... MAX_NUMNODES-1] = PXM_INVAL
+};
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
static int __init find_node_by_addr(unsigned long addr)
{
int ret = NUMA_NO_NODE;
@@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr)
break;
}
}
- return i;
+ return ret;
}
/*
@@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr)
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- int fake_node_to_pxm_map[MAX_NUMNODES] = {
- [0 ... MAX_NUMNODES-1] = PXM_INVAL
- };
- unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
- [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
- };
printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
"topology.\n");