27 files changed, 3286 insertions, 3330 deletions
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
index 362b4ad082d..c36ae88bb54 100644
--- a/arch/x86/mm/Makefile_32
+++ b/arch/x86/mm/Makefile_32
@@ -2,9 +2,8 @@
 # Makefile for the linux i386-specific parts of the memory manager.
 #
 
-obj-y	:= init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o
+obj-y	:= init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
 
 obj-$(CONFIG_NUMA) += discontig_32.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
index 6bcb47945b8..688c8c28ac8 100644
--- a/arch/x86/mm/Makefile_64
+++ b/arch/x86/mm/Makefile_64
@@ -2,9 +2,8 @@
 # Makefile for the linux x86_64-specific parts of the memory manager.
 #
 
-obj-y	 := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o
+obj-y	 := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NUMA) += numa_64.o
 obj-$(CONFIG_K8_NUMA) += k8topology_64.o
 obj-$(CONFIG_ACPI_NUMA) += srat_64.o
-
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
deleted file mode 100644
index f14da2a53ec..00000000000
--- a/arch/x86/mm/boot_ioremap_32.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * arch/i386/mm/boot_ioremap.c
- * 
- * Re-map functions for early boot-time before paging_init() when the 
- * boot-time pagetables are still in use
- *
- * Written by Dave Hansen <haveblue@us.ibm.com>
- */
-
-
-/*
- * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
- * keeps that from happening.  If anyone has a better way, I'm listening.
- *
- * boot_pte_t is defined only if this all works correctly
- */
-
-#undef CONFIG_X86_PAE
-#undef CONFIG_PARAVIRT
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include <linux/init.h>
-#include <linux/stddef.h>
-
-/* 
- * I'm cheating here.  It is known that the two boot PTE pages are 
- * allocated next to each other.  I'm pretending that they're just
- * one big array. 
- */
-
-#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
-
-static unsigned long boot_pte_index(unsigned long vaddr) 
-{
-	return __pa(vaddr) >> PAGE_SHIFT;
-}
-
-static inline boot_pte_t* boot_vaddr_to_pte(void *address)
-{
-	boot_pte_t* boot_pg = (boot_pte_t*)pg0;
-	return &boot_pg[boot_pte_index((unsigned long)address)];
-}
-
-/*
- * This is only for a caller who is clever enough to page-align
- * phys_addr and virtual_source, and who also has a preference
- * about which virtual address from which to steal ptes
- */
-static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, 
-		    void* virtual_source)
-{
-	boot_pte_t* pte;
-	int i;
-	char *vaddr = virtual_source;
-
-	pte = boot_vaddr_to_pte(virtual_source);
-	for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
-		set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
-		__flush_tlb_one(&vaddr[i*PAGE_SIZE]);
-	}
-}
-
-/* the virtual space we're going to remap comes from this array */
-#define BOOT_IOREMAP_PAGES 4
-#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
-static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
-		       __attribute__ ((aligned (PAGE_SIZE)));
-
-/*
- * This only applies to things which need to ioremap before paging_init()
- * bt_ioremap() and plain ioremap() are both useless at this point.
- * 
- * When used, we're still using the boot-time pagetables, which only
- * have 2 PTE pages mapping the first 8MB
- *
- * There is no unmap.  The boot-time PTE pages aren't used after boot.
- * If you really want the space back, just remap it yourself.
- * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
- */
-__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
-{
-	unsigned long last_addr, offset;
-	unsigned int nrpages;
-	
-	last_addr = phys_addr + size - 1;
-
-	/* page align the requested address */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr) - phys_addr;
-	
-	nrpages = size >> PAGE_SHIFT;
-	if (nrpages > BOOT_IOREMAP_PAGES)
-		return NULL;
-	
-	__boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
-
-	return &boot_ioremap_space[offset];
-}
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 13a474d3c6e..04b1d20e261 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -32,6 +32,7 @@
 #include <linux/kexec.h>
 #include <linux/pfn.h>
 #include <linux/swap.h>
+#include <linux/acpi.h>
 
 #include <asm/e820.h>
 #include <asm/setup.h>
@@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
 
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 unsigned long node_remap_size[MAX_NUMNODES];
-static unsigned long node_remap_offset[MAX_NUMNODES];
 static void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
 static unsigned long kva_start_pfn;
 static unsigned long kva_pages;
 /*
@@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid)
 	}
 }
 
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * In the discontig memory model, a portion of the kernel virtual area (KVA)
+ * is reserved and portions of nodes are mapped using it. This is to allow
+ * node-local memory to be allocated for structures that would normally require
+ * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
+ * should be prepared to allocate from the bootmem allocator instead. This KVA
+ * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
+ * layout of memory that are broken if alloc_remap() succeeds for some of the
+ * map and fails for others
+ */
+static unsigned long node_remap_start_pfn[MAX_NUMNODES];
+static void *node_remap_end_vaddr[MAX_NUMNODES];
+static void *node_remap_alloc_vaddr[MAX_NUMNODES];
+static unsigned long node_remap_offset[MAX_NUMNODES];
+
 void *alloc_remap(int nid, unsigned long size)
 {
 	void *allocation = node_remap_alloc_vaddr[nid];
@@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void)
 	return reserve_pages;
 }
 
+static void init_remap_allocator(int nid)
+{
+	node_remap_start_vaddr[nid] = pfn_to_kaddr(
+			kva_start_pfn + node_remap_offset[nid]);
+	node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+		(node_remap_size[nid] * PAGE_SIZE);
+	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+		ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
+	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+		(ulong) node_remap_start_vaddr[nid],
+		(ulong) pfn_to_kaddr(highstart_pfn
+		   + node_remap_offset[nid] + node_remap_size[nid]));
+}
+#else
+void *alloc_remap(int nid, unsigned long size)
+{
+	return NULL;
+}
+
+static unsigned long calculate_numa_remap_pages(void)
+{
+	return 0;
+}
+
+static void init_remap_allocator(int nid)
+{
+}
+
+void __init remap_numa_kva(void)
+{
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
 extern void setup_bootmem_allocator(void);
 unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
+	unsigned long wasted_pages;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -288,11 +336,18 @@ unsigned long __init setup_memory(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Numa kva area is below the initrd */
-	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image)
-		kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image)
+	if (initrd_start)
+		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
 			- kva_pages;
 #endif
-	kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1);
+
+	/*
+	 * We waste pages past at the end of the KVA for no good reason other
+	 * than how it is located. This is bad.
+	 */
+	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
+	kva_start_pfn -= wasted_pages;
+	kva_pages += wasted_pages;
 
 	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
@@ -318,19 +373,9 @@ unsigned long __init setup_memory(void)
 	printk("Low memory ends at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
-		node_remap_start_vaddr[nid] = pfn_to_kaddr(
-				kva_start_pfn + node_remap_offset[nid]);
-		/* Init the node remap allocator */
-		node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
-			(node_remap_size[nid] * PAGE_SIZE);
-		node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
-			ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+		init_remap_allocator(nid);
 
 		allocate_pgdat(nid);
-		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
-			(ulong) node_remap_start_vaddr[nid],
-			(ulong) pfn_to_kaddr(highstart_pfn
-			   + node_remap_offset[nid] + node_remap_size[nid]));
 	}
 	printk("High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
@@ -345,7 +390,8 @@ unsigned long __init setup_memory(void)
 
 void __init numa_kva_reserve(void)
 {
-	reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages));
+	if (kva_pages)
+		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages));
 }
 
 void __init zone_sizes_init(void)
@@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr)
 
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
+
+#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT
+/*
+ * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA
+ *
+ * These stub functions are needed to compile 32-bit NUMA when SRAT is
+ * not set. There are functions in srat_64.c for parsing this table
+ * and it may be possible to make them common functions.
+ */
+void acpi_numa_slit_init (struct acpi_table_slit *slit)
+{
+	printk(KERN_INFO "ACPI: No support for parsing SLIT table\n");
+}
+
+void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa)
+{
+}
+
+void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
+#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 00000000000..7e8db53528a
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,62 @@
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+
+
+int fixup_exception(struct pt_regs *regs)
+{
+	const struct exception_table_entry *fixup;
+
+#ifdef CONFIG_PNPBIOS
+	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
+		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+		extern u32 pnp_bios_is_utter_crap;
+		pnp_bios_is_utter_crap = 1;
+		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+		__asm__ volatile(
+			"movl %0, %%esp\n\t"
+			"jmp *%1\n\t"
+			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+		panic("do_trap: can't hit this");
+	}
+#endif
+
+	fixup = search_exception_tables(regs->ip);
+	if (fixup) {
+		regs->ip = fixup->fixup;
+		return 1;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Need to defined our own search_extable on X86_64 to work around
+ * a B stepping K8 bug.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+	       const struct exception_table_entry *last,
+	       unsigned long value)
+{
+	/* B stepping K8 bug */
+	if ((value >> 32) == 0)
+		value |= 0xffffffffUL << 32;
+
+	while (first <= last) {
+		const struct exception_table_entry *mid;
+		long diff;
+
+		mid = (last - first) / 2 + first;
+		diff = mid->insn - value;
+		if (diff == 0)
+			return mid;
+		else if (diff < 0)
+			first = mid+1;
+		else
+			last = mid-1;
+	}
+	return NULL;
+}
+#endif
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
deleted file mode 100644
index 0ce4f22a263..00000000000
--- a/arch/x86/mm/extable_32.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/i386/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <asm/uaccess.h>
-
-int fixup_exception(struct pt_regs *regs)
-{
-	const struct exception_table_entry *fixup;
-
-#ifdef CONFIG_PNPBIOS
-	if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs)))
-	{
-		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
-		extern u32 pnp_bios_is_utter_crap;
-		pnp_bios_is_utter_crap = 1;
-		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
-		__asm__ volatile(
-			"movl %0, %%esp\n\t"
-			"jmp *%1\n\t"
-			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
-		panic("do_trap: can't hit this");
-	}
-#endif
-
-	fixup = search_exception_tables(regs->eip);
-	if (fixup) {
-		regs->eip = fixup->fixup;
-		return 1;
-	}
-
-	return 0;
-}
diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c
deleted file mode 100644
index 79ac6e7100a..00000000000
--- a/arch/x86/mm/extable_64.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * linux/arch/x86_64/mm/extable.c
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-
-/* Simple binary search */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-	       const struct exception_table_entry *last,
-	       unsigned long value)
-{
-	/* Work around a B stepping K8 bug */
-	if ((value >> 32) == 0)
-		value |= 0xffffffffUL << 32; 
-
-        while (first <= last) {
-		const struct exception_table_entry *mid;
-		long diff;
-
-		mid = (last - first) / 2 + first;
-		diff = mid->insn - value;
-                if (diff == 0)
-                        return mid;
-                else if (diff < 0)
-                        first = mid+1;
-                else
-                        last = mid-1;
-        }
-        return NULL;
-}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 00000000000..e28cc5277b1
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,986 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/compiler.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>		/* for max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+
+#include <asm/system.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/pgalloc.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+#include <asm/proto.h>
+#include <asm-generic/sections.h>
+
+/*
+ * Page fault error code bits
+ *	bit 0 == 0 means no page found, 1 means protection fault
+ *	bit 1 == 0 means read, 1 means write
+ *	bit 2 == 0 means kernel, 1 means user-mode
+ *	bit 3 == 1 means use of reserved bit detected
+ *	bit 4 == 1 means fault was an instruction fetch
+ */
+#define PF_PROT		(1<<0)
+#define PF_WRITE	(1<<1)
+#define PF_USER		(1<<2)
+#define PF_RSVD		(1<<3)
+#define PF_INSTR	(1<<4)
+
+static inline int notify_page_fault(struct pt_regs *regs)
+{
+#ifdef CONFIG_KPROBES
+	int ret = 0;
+
+	/* kprobe_running() needs smp_processor_id() */
+#ifdef CONFIG_X86_32
+	if (!user_mode_vm(regs)) {
+#else
+	if (!user_mode(regs)) {
+#endif
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, 14))
+			ret = 1;
+		preempt_enable();
+	}
+
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * X86_32
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * X86_64
+ * Sometimes the CPU reports invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner
+ */
+static int is_prefetch(struct pt_regs *regs, unsigned long addr,
+		       unsigned long error_code)
+{
+	unsigned char *instr;
+	int scan_more = 1;
+	int prefetch = 0;
+	unsigned char *max_instr;
+
+#ifdef CONFIG_X86_32
+	if (!(__supported_pte_mask & _PAGE_NX))
+		return 0;
+#endif
+
+	/* If it was a exec fault on NX page, ignore */
+	if (error_code & PF_INSTR)
+		return 0;
+
+	instr = (unsigned char *)convert_ip_to_linear(current, regs);
+	max_instr = instr + 15;
+
+	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+		return 0;
+
+	while (scan_more && instr < max_instr) {
+		unsigned char opcode;
+		unsigned char instr_hi;
+		unsigned char instr_lo;
+
+		if (probe_kernel_address(instr, opcode))
+			break;
+
+		instr_hi = opcode & 0xf0;
+		instr_lo = opcode & 0x0f;
+		instr++;
+
+		switch (instr_hi) {
+		case 0x20:
+		case 0x30:
+			/*
+			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+			 * In X86_64 long mode, the CPU will signal invalid
+			 * opcode if some of these prefixes are present so
+			 * X86_64 will never get here anyway
+			 */
+			scan_more = ((instr_lo & 7) == 0x6);
+			break;
+#ifdef CONFIG_X86_64
+		case 0x40:
+			/*
+			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+			 * Need to figure out under what instruction mode the
+			 * instruction was issued. Could check the LDT for lm,
+			 * but for now it's good enough to assume that long
+			 * mode only uses well known segments or kernel.
+			 */
+			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
+			break;
+#endif
+		case 0x60:
+			/* 0x64 thru 0x67 are valid prefixes in all modes. */
+			scan_more = (instr_lo & 0xC) == 0x4;
+			break;
+		case 0xF0:
+			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+			scan_more = !instr_lo || (instr_lo>>1) == 1;
+			break;
+		case 0x00:
+			/* Prefetch instruction is 0x0F0D or 0x0F18 */
+			scan_more = 0;
+
+			if (probe_kernel_address(instr, opcode))
+				break;
+			prefetch = (instr_lo == 0xF) &&
+				(opcode == 0x0D || opcode == 0x18);
+			break;
+		default:
+			scan_more = 0;
+			break;
+		}
+	}
+	return prefetch;
+}
+
+static void force_sig_info_fault(int si_signo, int si_code,
+	unsigned long address, struct task_struct *tsk)
+{
+	siginfo_t info;
+
+	info.si_signo = si_signo;
+	info.si_errno = 0;
+	info.si_code = si_code;
+	info.si_addr = (void __user *)address;
+	force_sig_info(si_signo, &info, tsk);
+}
+
+#ifdef CONFIG_X86_64
+static int bad_address(void *p)
+{
+	unsigned long dummy;
+	return probe_kernel_address((unsigned long *)p, dummy);
+}
+#endif
+
+void dump_pagetable(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+	__typeof__(pte_val(__pte(0))) page;
+
+	page = read_cr3();
+	page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
+#ifdef CONFIG_X86_PAE
+	printk("*pdpt = %016Lx ", page);
+	if ((page >> PAGE_SHIFT) < max_low_pfn
+	    && page & _PAGE_PRESENT) {
+		page &= PAGE_MASK;
+		page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
+		                                         & (PTRS_PER_PMD - 1)];
+		printk(KERN_CONT "*pde = %016Lx ", page);
+		page &= ~_PAGE_NX;
+	}
+#else
+	printk("*pde = %08lx ", page);
+#endif
+
+	/*
+	 * We must not directly access the pte in the highpte
+	 * case if the page table is located in highmem.
+	 * And let's rather not kmap-atomic the pte, just in case
+	 * it's allocated already.
+	 */
+	if ((page >> PAGE_SHIFT) < max_low_pfn
+	    && (page & _PAGE_PRESENT)
+	    && !(page & _PAGE_PSE)) {
+		page &= PAGE_MASK;
+		page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
+		                                         & (PTRS_PER_PTE - 1)];
+		printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
+	}
+
+	printk("\n");
+#else /* CONFIG_X86_64 */
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = (pgd_t *)read_cr3();
+
+	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
+	pgd += pgd_index(address);
+	if (bad_address(pgd)) goto bad;
+	printk("PGD %lx ", pgd_val(*pgd));
+	if (!pgd_present(*pgd)) goto ret;
+
+	pud = pud_offset(pgd, address);
+	if (bad_address(pud)) goto bad;
+	printk("PUD %lx ", pud_val(*pud));
+	if (!pud_present(*pud))	goto ret;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd)) goto bad;
+	printk("PMD %lx ", pmd_val(*pmd));
+	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte)) goto bad;
+	printk("PTE %lx", pte_val(*pte));
+ret:
+	printk("\n");
+	return;
+bad:
+	printk("BAD\n");
+#endif
+}
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	/*
+	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+	 * and redundant with the set_pmd() on non-PAE. As would
+	 * set_pud.
+	 */
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+	if (!pmd_present(*pmd)) {
+		set_pmd(pmd, *pmd_k);
+		arch_flush_lazy_mmu_mode();
+	} else
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+	return pmd_k;
+}
+#endif
+
+#ifdef CONFIG_X86_64
+static const char errata93_warning[] =
+KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
+KERN_ERR "******* Please consider a BIOS update.\n"
+KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/* Workaround for K8 erratum #93 & buggy BIOS.
+   BIOS SMM functions are required to use a specific workaround
+   to avoid corruption of the 64bit RIP register on C stepping K8.
+   A lot of BIOS that didn't get tested properly miss this.
+   The OS sees this as a page fault with the upper 32bits of RIP cleared.
+   Try to work around it here.
+   Note we only handle faults in kernel here.
+   Does nothing for X86_32
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+	static int warned;
+	if (address != regs->ip)
+		return 0;
+	if ((address >> 32) != 0)
+		return 0;
+	address |= 0xffffffffUL << 32;
+	if ((address >= (u64)_stext && address <= (u64)_etext) ||
+	    (address >= MODULES_VADDR && address <= MODULES_END)) {
+		if (!warned) {
+			printk(errata93_warning);
+			warned = 1;
+		}
+		regs->ip = address;
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal
+ * addresses >4GB.  We catch this in the page fault handler because these
+ * addresses are not reachable. Just detect this case and return.  Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
+	    (address >> 32))
+		return 1;
+#endif
+	return 0;
+}
+
+void do_invalid_op(struct pt_regs *, unsigned long);
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+	unsigned long nr;
+	/*
+	 * Pentium F0 0F C7 C8 bug workaround.
+	 */
+	if (boot_cpu_data.f00f_bug) {
+		nr = (address - idt_descr.address) >> 3;
+
+		if (nr == 6) {
+			do_invalid_op(regs, 0);
+			return 1;
+		}
+	}
+#endif
+	return 0;
+}
+
+static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+			    unsigned long address)
+{
+#ifdef CONFIG_X86_32
+	if (!oops_may_print())
+		return;
+#endif
+
+#ifdef CONFIG_X86_PAE
+	if (error_code & PF_INSTR) {
+		int level;
+		pte_t *pte = lookup_address(address, &level);
+
+		if (pte && pte_present(*pte) && !pte_exec(*pte))
+			printk(KERN_CRIT "kernel tried to execute "
+				"NX-protected page - exploit attempt? "
+				"(uid: %d)\n", current->uid);
+	}
+#endif
+
+	printk(KERN_ALERT "BUG: unable to handle kernel ");
+	if (address < PAGE_SIZE)
+		printk(KERN_CONT "NULL pointer dereference");
+	else
+		printk(KERN_CONT "paging request");
+#ifdef CONFIG_X86_32
+	printk(KERN_CONT " at %08lx\n", address);
+#else
+	printk(KERN_CONT " at %016lx\n", address);
+#endif
+	printk(KERN_ALERT "IP:");
+	printk_address(regs->ip, 1);
+	dump_pagetable(address);
+}
+
+#ifdef CONFIG_X86_64
+static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
+				 unsigned long error_code)
+{
+	unsigned long flags = oops_begin();
+	struct task_struct *tsk;
+
+	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+	       current->comm, address);
+	dump_pagetable(address);
+	tsk = current;
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
+	if (__die("Bad pagetable", regs, error_code))
+		regs = NULL;
+	oops_end(flags, regs, SIGKILL);
+}
+#endif
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.  This allows
+ * us to lazily refresh the TLB when increasing the permissions of a
+ * kernel page (RO -> RW or NX -> X).  Doing it eagerly is very
+ * expensive since that implies doing a full cross-processor TLB
+ * flush, even if no stale TLB entries exist on other processors.
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ */
+static int spurious_fault(unsigned long address,
+			  unsigned long error_code)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	/* Reserved-bit violation or user access to kernel space? */
+	if (error_code & (PF_USER | PF_RSVD))
+		return 0;
+
+	pgd = init_mm.pgd + pgd_index(address);
+	if (!pgd_present(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return 0;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return 0;
+
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte))
+		return 0;
+
+	if ((error_code & PF_WRITE) && !pte_write(*pte))
+		return 0;
+	if ((error_code & PF_INSTR) && !pte_exec(*pte))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * X86_32
+ * Handle a fault on the vmalloc or module mapping area
+ *
+ * X86_64
+ * Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_32
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+	return 0;
+#else
+	pgd_t *pgd, *pgd_ref;
+	pud_t *pud, *pud_ref;
+	pmd_t *pmd, *pmd_ref;
+	pte_t *pte, *pte_ref;
+
+	/* Copy kernel mappings over when needed. This can also
+	   happen within a race in page table update. In the later
+	   case just flush. */
+
+	pgd = pgd_offset(current->mm ?: &init_mm, address);
+	pgd_ref = pgd_offset_k(address);
+	if (pgd_none(*pgd_ref))
+		return -1;
+	if (pgd_none(*pgd))
+		set_pgd(pgd, *pgd_ref);
+	else
+		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+	/* Below here mismatches are bugs because these lower tables
+	   are shared */
+
+	pud = pud_offset(pgd, address);
+	pud_ref = pud_offset(pgd_ref, address);
+	if (pud_none(*pud_ref))
+		return -1;
+	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+		BUG();
+	pmd = pmd_offset(pud, address);
+	pmd_ref = pmd_offset(pud_ref, address);
+	if (pmd_none(*pmd_ref))
+		return -1;
+	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+		BUG();
+	pte_ref = pte_offset_kernel(pmd_ref, address);
+	if (!pte_present(*pte_ref))
+		return -1;
+	pte = pte_offset_kernel(pmd, address);
+	/* Don't use pte_page here, because the mappings can point
+	   outside mem_map, and the NUMA hash lookup cannot handle
+	   that. */
+	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+		BUG();
+	return 0;
+#endif
+}
+
+int show_unhandled_signals = 1;
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	unsigned long address;
+	int write, si_code;
+	int fault;
+#ifdef CONFIG_X86_64
+	unsigned long flags;
+#endif
+
+	/*
+	 * We can fault from pretty much anywhere, with unknown IRQ state.
+	 */
+	trace_hardirqs_fixup();
+
+	tsk = current;
+	mm = tsk->mm;
+	prefetchw(&mm->mmap_sem);
+
+	/* get the address */
+	address = read_cr2();
+
+	si_code = SEGV_MAPERR;
+
+	if (notify_page_fault(regs))
+		return;
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 9) == 0.
+	 */
+#ifdef CONFIG_X86_32
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+		    vmalloc_fault(address) >= 0)
+			return;
+
+		/* Can handle a stale RO->RW TLB */
+		if (spurious_fault(address, error_code))
+			return;
+
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+
+	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
+	   fault has been handled. */
+	if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+		local_irq_enable();
+
+	/*
+	 * If we're in an interrupt, have no user context or are running in an
+	 * atomic region then we must not take the fault.
+	 */
+	if (in_atomic() || !mm)
+		goto bad_area_nosemaphore;
+#else /* CONFIG_X86_64 */
+	if (unlikely(address >= TASK_SIZE64)) {
+		/*
+		 * Don't check for the module range here: its PML4
+		 * is always initialized because it's shared with the main
+		 * kernel text. Only vmalloc may need PML4 syncups.
+		 */
+		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
+			if (vmalloc_fault(address) >= 0)
+				return;
+		}
+
+		/* Can handle a stale RO->RW TLB */
+		if (spurious_fault(address, error_code))
+			return;
+
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock.
+		 */
+		goto bad_area_nosemaphore;
+	}
+	if (likely(regs->flags & X86_EFLAGS_IF))
+		local_irq_enable();
+
+	if (unlikely(error_code & PF_RSVD))
+		pgtable_bad(address, regs, error_code);
+
+	/*
+	 * If we're in an interrupt, have no user context or are running in an
+	 * atomic region then we must not take the fault.
+	 */
+	if (unlikely(in_atomic() || !mm))
+		goto bad_area_nosemaphore;
+
+	/*
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet.
+	 */
+	if (user_mode_vm(regs))
+		error_code |= PF_USER;
+again:
+#endif
+	/* When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in the
+	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
+	 * erroneous fault occurring in a code path which already holds mmap_sem
+	 * we will deadlock attempting to validate the fault against the
+	 * address space.  Luckily the kernel only validly references user
+	 * space from well defined areas of code, which are listed in the
+	 * exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibility of a deadlock.
+	 * Attempt to lock the address space, if we cannot we then validate the
+	 * source.  If this is invalid we can skip the address space check,
+	 * thus avoiding the deadlock.
+	 */
+	if (!down_read_trylock(&mm->mmap_sem)) {
+		if ((error_code & PF_USER) == 0 &&
+		    !search_exception_tables(regs->ip))
+			goto bad_area_nosemaphore;
+		down_read(&mm->mmap_sem);
+	}
+
+	vma = find_vma(mm, address);
+	if (!vma)
+		goto bad_area;
+	if (vma->vm_start <= address)
+		goto good_area;
+	if (!(vma->vm_flags & VM_GROWSDOWN))
+		goto bad_area;
+	if (error_code & PF_USER) {
+		/*
+		 * Accessing the stack below %sp is always a bug.
+		 * The large cushion allows instructions like enter
+		 * and pusha to work.  ("enter $65535,$31" pushes
+		 * 32 pointers and then decrements %sp by 65535.)
+		 */
+		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
+			goto bad_area;
+	}
+	if (expand_stack(vma, address))
+		goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+	si_code = SEGV_ACCERR;
+	write = 0;
+	switch (error_code & (PF_PROT|PF_WRITE)) {
+	default:	/* 3: write, present */
+		/* fall through */
+	case PF_WRITE:		/* write, not present */
+		if (!(vma->vm_flags & VM_WRITE))
+			goto bad_area;
+		write++;
+		break;
+	case PF_PROT:		/* read, present */
+		goto bad_area;
+	case 0:			/* read, not present */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			goto bad_area;
+	}
+
+#ifdef CONFIG_X86_32
+survive:
+#endif
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.
+	 */
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
+	}
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Did it hit the DOS screen memory VA from vm86 mode?
+	 */
+	if (v8086_mode(regs)) {
+		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+		if (bit < 32)
+			tsk->thread.screen_bitmap |= 1 << bit;
+	}
+#endif
+	up_read(&mm->mmap_sem);
+	return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+	up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+	/* User mode accesses just cause a SIGSEGV */
+	if (error_code & PF_USER) {
+		/*
+		 * It's possible to have interrupts off here.
+		 */
+		local_irq_enable();
+
+		/*
+		 * Valid to do another page fault here because this one came
+		 * from user space.
+		 */
+		if (is_prefetch(regs, address, error_code))
+			return;
+
+		if (is_errata100(regs, address))
+			return;
+
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit()) {
+			printk(
+#ifdef CONFIG_X86_32
+			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+#else
+			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
+#endif
+			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+			tsk->comm, task_pid_nr(tsk), address, regs->ip,
+			regs->sp, error_code);
+			print_vma_addr(" in ", regs->ip);
+			printk("\n");
+		}
+
+		tsk->thread.cr2 = address;
+		/* Kernel addresses are always protection faults */
+		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+		tsk->thread.trap_no = 14;
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+		return;
+	}
+
+	if (is_f00f_bug(regs, address))
+		return;
+
+no_context:
+	/* Are we prepared to handle this kernel fault?  */
+	if (fixup_exception(regs))
+		return;
+
+	/*
+	 * X86_32
+	 * Valid to do another page fault here, because if this fault
+	 * had been triggered by is_prefetch fixup_exception would have
+	 * handled it.
+	 *
+	 * X86_64
+	 * Hall of shame of CPU/BIOS bugs.
+	 */
+	if (is_prefetch(regs, address, error_code))
+		return;
+
+	if (is_errata93(regs, address))
+		return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+#ifdef CONFIG_X86_32
+	bust_spinlocks(1);
+#else
+	flags = oops_begin();
+#endif
+
+	show_fault_oops(regs, error_code, address);
+
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+	die("Oops", regs, error_code);
+	bust_spinlocks(0);
+	do_exit(SIGKILL);
+#else
+	if (__die("Oops", regs, error_code))
+		regs = NULL;
+	/* Executive summary in case the body of the oops scrolled away */
+	printk(KERN_EMERG "CR2: %016lx\n", address);
+	oops_end(flags, regs, SIGKILL);
+#endif
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+	up_read(&mm->mmap_sem);
+	if (is_global_init(tsk)) {
+		yield();
+#ifdef CONFIG_X86_32
+		down_read(&mm->mmap_sem);
+		goto survive;
+#else
+		goto again;
+#endif
+	}
+
+	printk("VM: killing process %s\n", tsk->comm);
+	if (error_code & PF_USER)
+		do_group_exit(SIGKILL);
+	goto no_context;
+
+do_sigbus:
+	up_read(&mm->mmap_sem);
+
+	/* Kernel mode? Handle exceptions or die */
+	if (!(error_code & PF_USER))
+		goto no_context;
+#ifdef CONFIG_X86_32
+	/* User space => ok to do another page fault */
+	if (is_prefetch(regs, address, error_code))
+		return;
+#endif
+	tsk->thread.cr2 = address;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 14;
+	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+void vmalloc_sync_all(void)
+{
+#ifdef CONFIG_X86_32
+	/*
+	 * Note that races in the updates of insync and start aren't
+	 * problematic: insync can only get set bits added, and updates to
+	 * start are only improving performance (without affecting correctness
+	 * if undone).
+	 */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = TASK_SIZE;
+	unsigned long address;
+
+	if (SHARED_KERNEL_PMD)
+		return;
+
+	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
+	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			unsigned long flags;
+			struct page *page;
+
+			spin_lock_irqsave(&pgd_lock, flags);
+			list_for_each_entry(page, &pgd_list, lru) {
+				if (!vmalloc_sync_one(page_address(page),
+						      address))
+					break;
+			}
+			spin_unlock_irqrestore(&pgd_lock, flags);
+			if (!page)
+				set_bit(pgd_index(address), insync);
+		}
+		if (address == start && test_bit(pgd_index(address), insync))
+			start = address + PGDIR_SIZE;
+	}
+#else /* CONFIG_X86_64 */
+	/*
+	 * Note that races in the updates of insync and start aren't
+	 * problematic: insync can only get set bits added, and updates to
+	 * start are only improving performance (without affecting correctness
+	 * if undone).
+	 */
+	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
+	static unsigned long start = VMALLOC_START & PGDIR_MASK;
+	unsigned long address;
+
+	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+		if (!test_bit(pgd_index(address), insync)) {
+			const pgd_t *pgd_ref = pgd_offset_k(address);
+			struct page *page;
+
+			if (pgd_none(*pgd_ref))
+				continue;
+			spin_lock(&pgd_lock);
+			list_for_each_entry(page, &pgd_list, lru) {
+				pgd_t *pgd;
+				pgd = (pgd_t *)page_address(page) + pgd_index(address);
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+				else
+					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+			}
+			spin_unlock(&pgd_lock);
+			set_bit(pgd_index(address), insync);
+		}
+		if (address == start)
+			start = address + PGDIR_SIZE;
+	}
+	/* Check that there is no need to do the same for the modules area. */
+	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+				(__START_KERNEL & PGDIR_MASK)));
+#endif
+}
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
deleted file mode 100644
index a2273d44aa2..00000000000
--- a/arch/x86/mm/fault_32.c
+++ /dev/null
@@ -1,659 +0,0 @@
-/*
- *  linux/arch/i386/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>		/* For unblank_screen() */
-#include <linux/highmem.h>
-#include <linux/bootmem.h>		/* for max_low_pfn */
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-
-extern void die(const char *,struct pt_regs *,long);
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode_vm(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	return 0;
-}
-#endif
-
-/*
- * Return EIP plus the CS segment base.  The segment limit is also
- * adjusted, clamped to the kernel/user address space (whichever is
- * appropriate), and returned in *eip_limit.
- *
- * The segment is checked, because it might have been changed by another
- * task between the original faulting instruction and here.
- *
- * If CS is no longer a valid code segment, or if EIP is beyond the
- * limit, or if it is a kernel address when CS is not a kernel segment,
- * then the returned value will be greater than *eip_limit.
- * 
- * This is slow, but is very rarely executed.
- */
-static inline unsigned long get_segment_eip(struct pt_regs *regs,
-					    unsigned long *eip_limit)
-{
-	unsigned long eip = regs->eip;
-	unsigned seg = regs->xcs & 0xffff;
-	u32 seg_ar, seg_limit, base, *desc;
-
-	/* Unlikely, but must come before segment checks. */
-	if (unlikely(regs->eflags & VM_MASK)) {
-		base = seg << 4;
-		*eip_limit = base + 0xffff;
-		return base + (eip & 0xffff);
-	}
-
-	/* The standard kernel/user address space limit. */
-	*eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
-	
-	/* By far the most common cases. */
-	if (likely(SEGMENT_IS_FLAT_CODE(seg)))
-		return eip;
-
-	/* Check the segment exists, is within the current LDT/GDT size,
-	   that kernel/user (ring 0..3) has the appropriate privilege,
-	   that it's a code segment, and get the limit. */
-	__asm__ ("larl %3,%0; lsll %3,%1"
-		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
-	if ((~seg_ar & 0x9800) || eip > seg_limit) {
-		*eip_limit = 0;
-		return 1;	 /* So that returned eip > *eip_limit. */
-	}
-
-	/* Get the GDT/LDT descriptor base. 
-	   When you look for races in this code remember that
-	   LDT and other horrors are only used in user space. */
-	if (seg & (1<<2)) {
-		/* Must lock the LDT while reading it. */
-		mutex_lock(&current->mm->context.lock);
-		desc = current->mm->context.ldt;
-		desc = (void *)desc + (seg & ~7);
-	} else {
-		/* Must disable preemption while reading the GDT. */
- 		desc = (u32 *)get_cpu_gdt_table(get_cpu());
-		desc = (void *)desc + (seg & ~7);
-	}
-
-	/* Decode the code segment base from the descriptor */
-	base = get_desc_base((unsigned long *)desc);
-
-	if (seg & (1<<2)) { 
-		mutex_unlock(&current->mm->context.lock);
-	} else
-		put_cpu();
-
-	/* Adjust EIP and segment limit, and clamp at the kernel limit.
-	   It's legitimate for segments to wrap at 0xffffffff. */
-	seg_limit += base;
-	if (seg_limit < *eip_limit && seg_limit >= base)
-		*eip_limit = seg_limit;
-	return eip + base;
-}
-
-/* 
- * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
- */
-static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
-{ 
-	unsigned long limit;
-	unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
-	int scan_more = 1;
-	int prefetch = 0; 
-	int i;
-
-	for (i = 0; scan_more && i < 15; i++) { 
-		unsigned char opcode;
-		unsigned char instr_hi;
-		unsigned char instr_lo;
-
-		if (instr > (unsigned char *)limit)
-			break;
-		if (probe_kernel_address(instr, opcode))
-			break; 
-
-		instr_hi = opcode & 0xf0; 
-		instr_lo = opcode & 0x0f; 
-		instr++;
-
-		switch (instr_hi) { 
-		case 0x20:
-		case 0x30:
-			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
-			scan_more = ((instr_lo & 7) == 0x6);
-			break;
-			
-		case 0x60:
-			/* 0x64 thru 0x67 are valid prefixes in all modes. */
-			scan_more = (instr_lo & 0xC) == 0x4;
-			break;		
-		case 0xF0:
-			/* 0xF0, 0xF2, and 0xF3 are valid prefixes */
-			scan_more = !instr_lo || (instr_lo>>1) == 1;
-			break;			
-		case 0x00:
-			/* Prefetch instruction is 0x0F0D or 0x0F18 */
-			scan_more = 0;
-			if (instr > (unsigned char *)limit)
-				break;
-			if (probe_kernel_address(instr, opcode))
-				break;
-			prefetch = (instr_lo == 0xF) &&
-				(opcode == 0x0D || opcode == 0x18);
-			break;			
-		default:
-			scan_more = 0;
-			break;
-		} 
-	}
-	return prefetch;
-}
-
-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-			      unsigned long error_code)
-{
-	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		     boot_cpu_data.x86 >= 6)) {
-		/* Catch an obscure case of prefetch inside an NX page. */
-		if (nx_enabled && (error_code & 16))
-			return 0;
-		return __is_prefetch(regs, addr);
-	}
-	return 0;
-} 
-
-static noinline void force_sig_info_fault(int si_signo, int si_code,
-	unsigned long address, struct task_struct *tsk)
-{
-	siginfo_t info;
-
-	info.si_signo = si_signo;
-	info.si_errno = 0;
-	info.si_code = si_code;
-	info.si_addr = (void __user *)address;
-	force_sig_info(si_signo, &info, tsk);
-}
-
-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
-
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
-	unsigned index = pgd_index(address);
-	pgd_t *pgd_k;
-	pud_t *pud, *pud_k;
-	pmd_t *pmd, *pmd_k;
-
-	pgd += index;
-	pgd_k = init_mm.pgd + index;
-
-	if (!pgd_present(*pgd_k))
-		return NULL;
-
-	/*
-	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-	 * and redundant with the set_pmd() on non-PAE. As would
-	 * set_pud.
-	 */
-
-	pud = pud_offset(pgd, address);
-	pud_k = pud_offset(pgd_k, address);
-	if (!pud_present(*pud_k))
-		return NULL;
-
-	pmd = pmd_offset(pud, address);
-	pmd_k = pmd_offset(pud_k, address);
-	if (!pmd_present(*pmd_k))
-		return NULL;
-	if (!pmd_present(*pmd)) {
-		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else
-		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-	return pmd_k;
-}
-
-/*
- * Handle a fault on the vmalloc or module mapping area
- *
- * This assumes no large pages in there.
- */
-static inline int vmalloc_fault(unsigned long address)
-{
-	unsigned long pgd_paddr;
-	pmd_t *pmd_k;
-	pte_t *pte_k;
-	/*
-	 * Synchronize this task's top level page-table
-	 * with the 'reference' page table.
-	 *
-	 * Do _not_ use "current" here. We might be inside
-	 * an interrupt in the middle of a task switch..
-	 */
-	pgd_paddr = read_cr3();
-	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-	if (!pmd_k)
-		return -1;
-	pte_k = pte_offset_kernel(pmd_k, address);
-	if (!pte_present(*pte_k))
-		return -1;
-	return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- *
- * error_code:
- *	bit 0 == 0 means no page found, 1 means protection fault
- *	bit 1 == 0 means read, 1 means write
- *	bit 2 == 0 means kernel, 1 means user-mode
- *	bit 3 == 1 means use of reserved bit detected
- *	bit 4 == 1 means fault was an instruction fetch
- */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-				      unsigned long error_code)
-{
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct vm_area_struct * vma;
-	unsigned long address;
-	int write, si_code;
-	int fault;
-
-	/*
-	 * We can fault from pretty much anywhere, with unknown IRQ state.
-	 */
-	trace_hardirqs_fixup();
-
-	/* get the address */
-        address = read_cr2();
-
-	tsk = current;
-
-	si_code = SEGV_MAPERR;
-
-	/*
-	 * We fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
-	 */
-	if (unlikely(address >= TASK_SIZE)) {
-		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)
-			return;
-		if (notify_page_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock.
-		 */
-		goto bad_area_nosemaphore;
-	}
-
-	if (notify_page_fault(regs))
-		return;
-
-	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
-	   fault has been handled. */
-	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-		local_irq_enable();
-
-	mm = tsk->mm;
-
-	/*
-	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault..
-	 */
-	if (in_atomic() || !mm)
-		goto bad_area_nosemaphore;
-
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space.  Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source.  If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
-		    !search_exception_tables(regs->eip))
-			goto bad_area_nosemaphore;
-		down_read(&mm->mmap_sem);
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (error_code & 4) {
-		/*
-		 * Accessing the stack below %esp is always a bug.
-		 * The large cushion allows instructions like enter
-		 * and pusha to work.  ("enter $65535,$31" pushes
-		 * 32 pointers and then decrements %esp by 65535.)
-		 */
-		if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-	si_code = SEGV_ACCERR;
-	write = 0;
-	switch (error_code & 3) {
-		default:	/* 3: write, present */
-				/* fall through */
-		case 2:		/* write, not present */
-			if (!(vma->vm_flags & VM_WRITE))
-				goto bad_area;
-			write++;
-			break;
-		case 1:		/* read, present */
-			goto bad_area;
-		case 0:		/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-				goto bad_area;
-	}
-
- survive:
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	fault = handle_mm_fault(mm, vma, address, write);
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		if (fault & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (fault & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
-	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
-
-	/*
-	 * Did it hit the DOS screen memory VA from vm86 mode?
-	 */
-	if (regs->eflags & VM_MASK) {
-		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
-		if (bit < 32)
-			tsk->thread.screen_bitmap |= 1 << bit;
-	}
-	up_read(&mm->mmap_sem);
-	return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & 4) {
-		/*
-		 * It's possible to have interrupts off here.
-		 */
-		local_irq_enable();
-
-		/* 
-		 * Valid to do another page fault here because this one came 
-		 * from user space.
-		 */
-		if (is_prefetch(regs, address, error_code))
-			return;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit()) {
-			printk("%s%s[%d]: segfault at %08lx eip %08lx "
-			    "esp %08lx error %lx\n",
-			    task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-			    tsk->comm, task_pid_nr(tsk), address, regs->eip,
-			    regs->esp, error_code);
-		}
-		tsk->thread.cr2 = address;
-		/* Kernel addresses are always protection faults */
-		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-		tsk->thread.trap_no = 14;
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-		return;
-	}
-
-#ifdef CONFIG_X86_F00F_BUG
-	/*
-	 * Pentium F0 0F C7 C8 bug workaround.
-	 */
-	if (boot_cpu_data.f00f_bug) {
-		unsigned long nr;
-		
-		nr = (address - idt_descr.address) >> 3;
-
-		if (nr == 6) {
-			do_invalid_op(regs, 0);
-			return;
-		}
-	}
-#endif
-
-no_context:
-	/* Are we prepared to handle this kernel fault?  */
-	if (fixup_exception(regs))
-		return;
-
-	/* 
-	 * Valid to do another page fault here, because if this fault
-	 * had been triggered by is_prefetch fixup_exception would have 
-	 * handled it.
-	 */
- 	if (is_prefetch(regs, address, error_code))
- 		return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-	bust_spinlocks(1);
-
-	if (oops_may_print()) {
-		__typeof__(pte_val(__pte(0))) page;
-
-#ifdef CONFIG_X86_PAE
-		if (error_code & 16) {
-			pte_t *pte = lookup_address(address);
-
-			if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
-				printk(KERN_CRIT "kernel tried to execute "
-					"NX-protected page - exploit attempt? "
-					"(uid: %d)\n", current->uid);
-		}
-#endif
-		if (address < PAGE_SIZE)
-			printk(KERN_ALERT "BUG: unable to handle kernel NULL "
-					"pointer dereference");
-		else
-			printk(KERN_ALERT "BUG: unable to handle kernel paging"
-					" request");
-		printk(" at virtual address %08lx\n",address);
-		printk(KERN_ALERT "printing eip: %08lx ", regs->eip);
-
-		page = read_cr3();
-		page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT];
-#ifdef CONFIG_X86_PAE
-		printk("*pdpt = %016Lx ", page);
-		if ((page >> PAGE_SHIFT) < max_low_pfn
-		    && page & _PAGE_PRESENT) {
-			page &= PAGE_MASK;
-			page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT)
-			                                         & (PTRS_PER_PMD - 1)];
-			printk(KERN_CONT "*pde = %016Lx ", page);
-			page &= ~_PAGE_NX;
-		}
-#else
-		printk("*pde = %08lx ", page);
-#endif
-
-		/*
-		 * We must not directly access the pte in the highpte
-		 * case if the page table is located in highmem.
-		 * And let's rather not kmap-atomic the pte, just in case
-		 * it's allocated already.
-		 */
-		if ((page >> PAGE_SHIFT) < max_low_pfn
-		    && (page & _PAGE_PRESENT)
-		    && !(page & _PAGE_PSE)) {
-			page &= PAGE_MASK;
-			page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT)
-			                                         & (PTRS_PER_PTE - 1)];
-			printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page);
-		}
-
-		printk("\n");
-	}
-
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-	die("Oops", regs, error_code);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-	up_read(&mm->mmap_sem);
-	if (is_global_init(tsk)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
-	printk("VM: killing process %s\n", tsk->comm);
-	if (error_code & 4)
-		do_group_exit(SIGKILL);
-	goto no_context;
-
-do_sigbus:
-	up_read(&mm->mmap_sem);
-
-	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & 4))
-		goto no_context;
-
-	/* User space => ok to do another page fault */
-	if (is_prefetch(regs, address, error_code))
-		return;
-
-	tsk->thread.cr2 = address;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 14;
-	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
-}
-
-void vmalloc_sync_all(void)
-{
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = TASK_SIZE;
-	unsigned long address;
-
-	if (SHARED_KERNEL_PMD)
-		return;
-
-	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			unsigned long flags;
-			struct page *page;
-
-			spin_lock_irqsave(&pgd_lock, flags);
-			for (page = pgd_list; page; page =
-					(struct page *)page->index)
-				if (!vmalloc_sync_one(page_address(page),
-								address)) {
-					BUG_ON(page != pgd_list);
-					break;
-				}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			if (!page)
-				set_bit(pgd_index(address), insync);
-		}
-		if (address == start && test_bit(pgd_index(address), insync))
-			start = address + PGDIR_SIZE;
-	}
-}
diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c
deleted file mode 100644
index 0e26230669c..00000000000
--- a/arch/x86/mm/fault_64.c
+++ /dev/null
@@ -1,623 +0,0 @@
-/*
- *  linux/arch/x86-64/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
- */
-
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/tty.h>
-#include <linux/vt_kern.h>		/* For unblank_screen() */
-#include <linux/compiler.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kprobes.h>
-
-#include <asm/system.h>
-#include <asm/pgalloc.h>
-#include <asm/smp.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm-generic/sections.h>
-
-/* Page fault error code bits */
-#define PF_PROT	(1<<0)		/* or no page found */
-#define PF_WRITE	(1<<1)
-#define PF_USER	(1<<2)
-#define PF_RSVD	(1<<3)
-#define PF_INSTR	(1<<4)
-
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
-}
-#else
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	return 0;
-}
-#endif
-
-/* Sometimes the CPU reports invalid exceptions on prefetch.
-   Check that here and ignore.
-   Opcode checker based on code by Richard Brunner */
-static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
-				unsigned long error_code)
-{ 
-	unsigned char *instr;
-	int scan_more = 1;
-	int prefetch = 0; 
-	unsigned char *max_instr;
-
-	/* If it was a exec fault ignore */
-	if (error_code & PF_INSTR)
-		return 0;
-	
-	instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
-	max_instr = instr + 15;
-
-	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
-		return 0;
-
-	while (scan_more && instr < max_instr) { 
-		unsigned char opcode;
-		unsigned char instr_hi;
-		unsigned char instr_lo;
-
-		if (probe_kernel_address(instr, opcode))
-			break; 
-
-		instr_hi = opcode & 0xf0; 
-		instr_lo = opcode & 0x0f; 
-		instr++;
-
-		switch (instr_hi) { 
-		case 0x20:
-		case 0x30:
-			/* Values 0x26,0x2E,0x36,0x3E are valid x86
-			   prefixes.  In long mode, the CPU will signal
-			   invalid opcode if some of these prefixes are
-			   present so we will never get here anyway */
-			scan_more = ((instr_lo & 7) == 0x6);
-			break;
-			
-		case 0x40:
-			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
-			   Need to figure out under what instruction mode the
-			   instruction was issued ... */
-			/* Could check the LDT for lm, but for now it's good
-			   enough to assume that long mode only uses well known
-			   segments or kernel. */
-			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
-			break;
-			
-		case 0x60:
-			/* 0x64 thru 0x67 are valid prefixes in all modes. */
-			scan_more = (instr_lo & 0xC) == 0x4;
-			break;		
-		case 0xF0:
-			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
-			scan_more = !instr_lo || (instr_lo>>1) == 1;
-			break;			
-		case 0x00:
-			/* Prefetch instruction is 0x0F0D or 0x0F18 */
-			scan_more = 0;
-			if (probe_kernel_address(instr, opcode))
-				break;
-			prefetch = (instr_lo == 0xF) &&
-				(opcode == 0x0D || opcode == 0x18);
-			break;			
-		default:
-			scan_more = 0;
-			break;
-		} 
-	}
-	return prefetch;
-}
-
-static int bad_address(void *p) 
-{ 
-	unsigned long dummy;
-	return probe_kernel_address((unsigned long *)p, dummy);
-} 
-
-void dump_pagetable(unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
-	pgd = (pgd_t *)read_cr3();
-
-	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); 
-	pgd += pgd_index(address);
-	if (bad_address(pgd)) goto bad;
-	printk("PGD %lx ", pgd_val(*pgd));
-	if (!pgd_present(*pgd)) goto ret; 
-
-	pud = pud_offset(pgd, address);
-	if (bad_address(pud)) goto bad;
-	printk("PUD %lx ", pud_val(*pud));
-	if (!pud_present(*pud))	goto ret;
-
-	pmd = pmd_offset(pud, address);
-	if (bad_address(pmd)) goto bad;
-	printk("PMD %lx ", pmd_val(*pmd));
-	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
-
-	pte = pte_offset_kernel(pmd, address);
-	if (bad_address(pte)) goto bad;
-	printk("PTE %lx", pte_val(*pte)); 
-ret:
-	printk("\n");
-	return;
-bad:
-	printk("BAD\n");
-}
-
-static const char errata93_warning[] = 
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
-
-/* Workaround for K8 erratum #93 & buggy BIOS.
-   BIOS SMM functions are required to use a specific workaround
-   to avoid corruption of the 64bit RIP register on C stepping K8. 
-   A lot of BIOS that didn't get tested properly miss this. 
-   The OS sees this as a page fault with the upper 32bits of RIP cleared.
-   Try to work around it here.
-   Note we only handle faults in kernel here. */
-
-static int is_errata93(struct pt_regs *regs, unsigned long address) 
-{
-	static int warned;
-	if (address != regs->rip)
-		return 0;
-	if ((address >> 32) != 0) 
-		return 0;
-	address |= 0xffffffffUL << 32;
-	if ((address >= (u64)_stext && address <= (u64)_etext) || 
-	    (address >= MODULES_VADDR && address <= MODULES_END)) { 
-		if (!warned) {
-			printk(errata93_warning); 		
-			warned = 1;
-		}
-		regs->rip = address;
-		return 1;
-	}
-	return 0;
-} 
-
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-				 unsigned long error_code)
-{
-	unsigned long flags = oops_begin();
-	struct task_struct *tsk;
-
-	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-	       current->comm, address);
-	dump_pagetable(address);
-	tsk = current;
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-	__die("Bad pagetable", regs, error_code);
-	oops_end(flags);
-	do_exit(SIGKILL);
-}
-
-/*
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static int vmalloc_fault(unsigned long address)
-{
-	pgd_t *pgd, *pgd_ref;
-	pud_t *pud, *pud_ref;
-	pmd_t *pmd, *pmd_ref;
-	pte_t *pte, *pte_ref;
-
-	/* Copy kernel mappings over when needed. This can also
-	   happen within a race in page table update. In the later
-	   case just flush. */
-
-	pgd = pgd_offset(current->mm ?: &init_mm, address);
-	pgd_ref = pgd_offset_k(address);
-	if (pgd_none(*pgd_ref))
-		return -1;
-	if (pgd_none(*pgd))
-		set_pgd(pgd, *pgd_ref);
-	else
-		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-
-	/* Below here mismatches are bugs because these lower tables
-	   are shared */
-
-	pud = pud_offset(pgd, address);
-	pud_ref = pud_offset(pgd_ref, address);
-	if (pud_none(*pud_ref))
-		return -1;
-	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
-		BUG();
-	pmd = pmd_offset(pud, address);
-	pmd_ref = pmd_offset(pud_ref, address);
-	if (pmd_none(*pmd_ref))
-		return -1;
-	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
-		BUG();
-	pte_ref = pte_offset_kernel(pmd_ref, address);
-	if (!pte_present(*pte_ref))
-		return -1;
-	pte = pte_offset_kernel(pmd, address);
-	/* Don't use pte_page here, because the mappings can point
-	   outside mem_map, and the NUMA hash lookup cannot handle
-	   that. */
-	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
-		BUG();
-	return 0;
-}
-
-int show_unhandled_signals = 1;
-
-/*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-					unsigned long error_code)
-{
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct vm_area_struct * vma;
-	unsigned long address;
-	const struct exception_table_entry *fixup;
-	int write, fault;
-	unsigned long flags;
-	siginfo_t info;
-
-	/*
-	 * We can fault from pretty much anywhere, with unknown IRQ state.
-	 */
-	trace_hardirqs_fixup();
-
-	tsk = current;
-	mm = tsk->mm;
-	prefetchw(&mm->mmap_sem);
-
-	/* get the address */
-	address = read_cr2();
-
-	info.si_code = SEGV_MAPERR;
-
-
-	/*
-	 * We fault-in kernel-space virtual memory on-demand. The
-	 * 'reference' page table is init_mm.pgd.
-	 *
-	 * NOTE! We MUST NOT take any locks for this case. We may
-	 * be in an interrupt or a critical region, and should
-	 * only copy the information from the master page table,
-	 * nothing more.
-	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
-	 */
-	if (unlikely(address >= TASK_SIZE64)) {
-		/*
-		 * Don't check for the module range here: its PML4
-		 * is always initialized because it's shared with the main
-		 * kernel text. Only vmalloc may need PML4 syncups.
-		 */
-		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-		if (notify_page_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock.
-		 */
-		goto bad_area_nosemaphore;
-	}
-
-	if (notify_page_fault(regs))
-		return;
-
-	if (likely(regs->eflags & X86_EFLAGS_IF))
-		local_irq_enable();
-
-	if (unlikely(error_code & PF_RSVD))
-		pgtable_bad(address, regs, error_code);
-
-	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
-	 */
-	if (unlikely(in_atomic() || !mm))
-		goto bad_area_nosemaphore;
-
-	/*
-	 * User-mode registers count as a user access even for any
-	 * potential system fault or CPU buglet.
-	 */
-	if (user_mode_vm(regs))
-		error_code |= PF_USER;
-
- again:
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space.  Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source.  If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & PF_USER) == 0 &&
-		    !search_exception_tables(regs->rip))
-			goto bad_area_nosemaphore;
-		down_read(&mm->mmap_sem);
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (likely(vma->vm_start <= address))
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (error_code & 4) {
-		/* Allow userspace just enough access below the stack pointer
-		 * to let the 'enter' instruction work.
-		 */
-		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-	info.si_code = SEGV_ACCERR;
-	write = 0;
-	switch (error_code & (PF_PROT|PF_WRITE)) {
-		default:	/* 3: write, present */
-			/* fall through */
-		case PF_WRITE:		/* write, not present */
-			if (!(vma->vm_flags & VM_WRITE))
-				goto bad_area;
-			write++;
-			break;
-		case PF_PROT:		/* read, present */
-			goto bad_area;
-		case 0:			/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-				goto bad_area;
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	fault = handle_mm_fault(mm, vma, address, write);
-	if (unlikely(fault & VM_FAULT_ERROR)) {
-		if (fault & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (fault & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
-	}
-	if (fault & VM_FAULT_MAJOR)
-		tsk->maj_flt++;
-	else
-		tsk->min_flt++;
-	up_read(&mm->mmap_sem);
-	return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
-
-		/*
-		 * It's possible to have interrupts off here.
-		 */
-		local_irq_enable();
-
-		if (is_prefetch(regs, address, error_code))
-			return;
-
-		/* Work around K8 erratum #100 K8 in compat mode
-		   occasionally jumps to illegal addresses >4GB.  We
-		   catch this here in the page fault handler because
-		   these addresses are not reachable. Just detect this
-		   case and return.  Any code segment in LDT is
-		   compatibility mode. */
-		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
-		    (address >> 32))
-			return;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit()) {
-			printk(
-		       "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
-					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
-					tsk->comm, tsk->pid, address, regs->rip,
-					regs->rsp, error_code);
-		}
-       
-		tsk->thread.cr2 = address;
-		/* Kernel addresses are always protection faults */
-		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-		tsk->thread.trap_no = 14;
-		info.si_signo = SIGSEGV;
-		info.si_errno = 0;
-		/* info.si_code has been set above */
-		info.si_addr = (void __user *)address;
-		force_sig_info(SIGSEGV, &info, tsk);
-		return;
-	}
-
-no_context:
-	
-	/* Are we prepared to handle this kernel fault?  */
-	fixup = search_exception_tables(regs->rip);
-	if (fixup) {
-		regs->rip = fixup->fixup;
-		return;
-	}
-
-	/* 
-	 * Hall of shame of CPU/BIOS bugs.
-	 */
-
- 	if (is_prefetch(regs, address, error_code))
- 		return;
-
-	if (is_errata93(regs, address))
-		return; 
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-
-	flags = oops_begin();
-
-	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
-	else
-		printk(KERN_ALERT "Unable to handle kernel paging request");
-	printk(" at %016lx RIP: \n" KERN_ALERT,address);
-	printk_address(regs->rip);
-	dump_pagetable(address);
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-	__die("Oops", regs, error_code);
-	/* Executive summary in case the body of the oops scrolled away */
-	printk(KERN_EMERG "CR2: %016lx\n", address);
-	oops_end(flags);
-	do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
-	up_read(&mm->mmap_sem);
-	if (is_global_init(current)) {
-		yield();
-		goto again;
-	}
-	printk("VM: killing process %s\n", tsk->comm);
-	if (error_code & 4)
-		do_group_exit(SIGKILL);
-	goto no_context;
-
-do_sigbus:
-	up_read(&mm->mmap_sem);
-
-	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & PF_USER))
-		goto no_context;
-
-	tsk->thread.cr2 = address;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 14;
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRERR;
-	info.si_addr = (void __user *)address;
-	force_sig_info(SIGBUS, &info, tsk);
-	return;
-}
-
-DEFINE_SPINLOCK(pgd_lock);
-LIST_HEAD(pgd_list);
-
-void vmalloc_sync_all(void)
-{
-	/* Note that races in the updates of insync and start aren't 
-	   problematic:
-	   insync can only get set bits added, and updates to start are only
-	   improving performance (without affecting correctness if undone). */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = VMALLOC_START & PGDIR_MASK;
-	unsigned long address;
-
-	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			const pgd_t *pgd_ref = pgd_offset_k(address);
-			struct page *page;
-
-			if (pgd_none(*pgd_ref))
-				continue;
-			spin_lock(&pgd_lock);
-			list_for_each_entry(page, &pgd_list, lru) {
-				pgd_t *pgd;
-				pgd = (pgd_t *)page_address(page) + pgd_index(address);
-				if (pgd_none(*pgd))
-					set_pgd(pgd, *pgd_ref);
-				else
-					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-			}
-			spin_unlock(&pgd_lock);
-			set_bit(pgd_index(address), insync);
-		}
-		if (address == start)
-			start = address + PGDIR_SIZE;
-	}
-	/* Check that there is no need to do the same for the modules area. */
-	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
-	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 
-				(__START_KERNEL & PGDIR_MASK)));
-}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 1c3bf95f735..3d936f23270 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -18,6 +18,49 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
+static void debug_kmap_atomic_prot(enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+	static unsigned warn_count = 10;
+
+	if (unlikely(warn_count == 0))
+		return;
+
+	if (unlikely(in_interrupt())) {
+		if (in_irq()) {
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		} else if (!irqs_disabled()) {	/* softirq */
+			if (type != KM_IRQ0 && type != KM_IRQ1 &&
+			    type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
+			    type != KM_SKB_SUNRPC_DATA &&
+			    type != KM_SKB_DATA_SOFTIRQ &&
+			    type != KM_BOUNCE_READ) {
+				WARN_ON(1);
+				warn_count--;
+			}
+		}
+	}
+
+	if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
+			type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+		if (!irqs_disabled()) {
+			WARN_ON(1);
+			warn_count--;
+		}
+	} else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
+		if (irq_count() == 0 && !irqs_disabled()) {
+			WARN_ON(1);
+			warn_count--;
+		}
+	}
+#endif
+}
+
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
-
 	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+
+	debug_kmap_atomic_prot(type);
+
 	pagefault_disable();
 
 	if (!PageHighMem(page))
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 6c06d9c0488..4fbafb4bc2f 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -15,6 +15,7 @@
 #include <asm/mman.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 
 static unsigned long page_table_shareable(struct vm_area_struct *svma,
 				struct vm_area_struct *vma,
@@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 
 	spin_lock(&mm->page_table_lock);
 	if (pud_none(*pud))
-		pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK);
+		pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
 	else
 		put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3c76d194fd2..da524fb2242 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -27,7 +27,6 @@
 #include <linux/bootmem.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
-#include <linux/efi.h>
 #include <linux/memory_hotplug.h>
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
@@ -40,8 +39,10 @@
 #include <asm/fixmap.h>
 #include <asm/e820.h>
 #include <asm/apic.h>
+#include <asm/bugs.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/paravirt.h>
 
@@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
-static int noinline do_test_wp_bit(void);
+static noinline int do_test_wp_bit(void);
 
 /*
  * Creates a middle page table and puts a pointer to it in the
@@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 {
 	pud_t *pud;
 	pmd_t *pmd_table;
-		
+
 #ifdef CONFIG_X86_PAE
 	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
 		pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
 
-		paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+		paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
 		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 		pud = pud_offset(pgd, 0);
-		if (pmd_table != pmd_offset(pud, 0))
-			BUG();
+		BUG_ON(pmd_table != pmd_offset(pud, 0));
 	}
 #endif
 	pud = pud_offset(pgd, 0);
 	pmd_table = pmd_offset(pud, 0);
+
 	return pmd_table;
 }
 
 /*
  * Create a page table and place a pointer to it in a middle page
- * directory entry.
+ * directory entry:
  */
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
@@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
-		if (!page_table)
+		if (!page_table) {
 			page_table =
 				(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+		}
 
 		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 }
 
 /*
- * This function initializes a certain range of kernel virtual memory 
+ * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
  * the given range.
- */
-
-/*
- * NOTE: The pagetables are allocated contiguous on the physical space 
- * so we can cache the place of the first one and move around without 
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
  * checking the pgd every time.
  */
-static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 {
-	pgd_t *pgd;
-	pmd_t *pmd;
 	int pgd_idx, pmd_idx;
 	unsigned long vaddr;
+	pgd_t *pgd;
+	pmd_t *pmd;
 
 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
@@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end
 	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
 		pmd = one_md_table_init(pgd);
 		pmd = pmd + pmd_index(vaddr);
-		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
+		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+							pmd++, pmd_idx++) {
 			one_page_table_init(pmd);
 
 			vaddr += PMD_SIZE;
@@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr)
 }
 
 /*
- * This maps the physical memory to kernel virtual address space, a total 
- * of max_low_pfn pages, by creating page tables starting from address 
- * PAGE_OFFSET.
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
  */
 static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 {
+	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pte_t *pte;
-	int pgd_idx, pmd_idx, pte_ofs;
 
 	pgd_idx = pgd_index(PAGE_OFFSET);
 	pgd = pgd_base + pgd_idx;
@@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		pmd = one_md_table_init(pgd);
 		if (pfn >= max_low_pfn)
 			continue;
-		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
-			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
 
-			/* Map with big pages if possible, otherwise create normal page tables. */
+		for (pmd_idx = 0;
+		     pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+		     pmd++, pmd_idx++) {
+			unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+			/*
+			 * Map with big pages if possible, otherwise
+			 * create normal page tables:
+			 */
 			if (cpu_has_pse) {
-				unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-				if (is_kernel_text(address) || is_kernel_text(address2))
-					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
-				else
-					set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
+				unsigned int addr2;
+				pgprot_t prot = PAGE_KERNEL_LARGE;
+
+				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+					PAGE_OFFSET + PAGE_SIZE-1;
+
+				if (is_kernel_text(addr) ||
+				    is_kernel_text(addr2))
+					prot = PAGE_KERNEL_LARGE_EXEC;
+
+				set_pmd(pmd, pfn_pmd(pfn, prot));
 
 				pfn += PTRS_PER_PTE;
-			} else {
-				pte = one_page_table_init(pmd);
-
-				for (pte_ofs = 0;
-				     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
-				     pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
-					if (is_kernel_text(address))
-						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-					else
-						set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
-				}
+				continue;
+			}
+			pte = one_page_table_init(pmd);
+
+			for (pte_ofs = 0;
+			     pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+				pgprot_t prot = PAGE_KERNEL;
+
+				if (is_kernel_text(addr))
+					prot = PAGE_KERNEL_EXEC;
+
+				set_pte(pte, pfn_pte(pfn, prot));
 			}
 		}
 	}
@@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr)
 	return 0;
 }
 
-int page_is_ram(unsigned long pagenr)
-{
-	int i;
-	unsigned long addr, end;
-
-	if (efi_enabled) {
-		efi_memory_desc_t *md;
-		void *p;
-
-		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
-			md = p;
-			if (!is_available_memory(md))
-				continue;
-			addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-			end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
-
-			if ((pagenr >= addr) && (pagenr < end))
-				return 1;
-		}
-		return 0;
-	}
-
-	for (i = 0; i < e820.nr_map; i++) {
-
-		if (e820.map[i].type != E820_RAM)	/* not usable memory */
-			continue;
-		/*
-		 *	!!!FIXME!!! Some BIOSen report areas as RAM that
-		 *	are not. Notably the 640->1Mb area. We need a sanity
-		 *	check here.
-		 */
-		addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
-		end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT;
-		if  ((pagenr >= addr) && (pagenr < end))
-			return 1;
-	}
-	return 0;
-}
-
 #ifdef CONFIG_HIGHMEM
 pte_t *kmap_pte;
 pgprot_t kmap_prot;
 
-#define kmap_get_fixmap_pte(vaddr)					\
-	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr);
+}
 
 static void __init kmap_init(void)
 {
 	unsigned long kmap_vstart;
 
-	/* cache the first kmap pte */
+	/*
+	 * Cache the first kmap pte:
+	 */
 	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
 	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
 
@@ -259,11 +241,11 @@ static void __init kmap_init(void)
 
 static void __init permanent_kmaps_init(pgd_t *pgd_base)
 {
+	unsigned long vaddr;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	unsigned long vaddr;
 
 	vaddr = PKMAP_BASE;
 	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
@@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pud = pud_offset(pgd, vaddr);
 	pmd = pmd_offset(pud, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
-	pkmap_page_table = pte;	
+	pkmap_page_table = pte;
 }
 
 static void __meminit free_new_highpage(struct page *page)
@@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
 		SetPageReserved(page);
 }
 
-static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
+static int __meminit
+add_one_highpage_hotplug(struct page *page, unsigned long pfn)
 {
 	free_new_highpage(page);
 	totalram_pages++;
@@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
 	max_mapnr = max(pfn, max_mapnr);
 #endif
 	num_physpages++;
+
 	return 0;
 }
 
@@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p
  * Not currently handling the NUMA case.
  * Assuming single node and all memory that
  * has been added dynamically that would be
- * onlined here is in HIGHMEM
+ * onlined here is in HIGHMEM.
  */
 void __meminit online_page(struct page *page)
 {
@@ -314,13 +298,11 @@ void __meminit online_page(struct page *page)
 	add_one_highpage_hotplug(page, page_to_pfn(page));
 }
 
-
-#ifdef CONFIG_NUMA
-extern void set_highmem_pages_init(int);
-#else
+#ifndef CONFIG_NUMA
 static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
+
 	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
 		/*
 		 * Holes under sparsemem might not have no mem_map[]:
@@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro)
 	}
 	totalram_pages += totalhigh_pages;
 }
-#endif /* CONFIG_FLATMEM */
+#endif /* !CONFIG_NUMA */
 
 #else
-#define kmap_init() do { } while (0)
-#define permanent_kmaps_init(pgd_base) do { } while (0)
-#define set_highmem_pages_init(bad_ppro) do { } while (0)
+# define kmap_init()				do { } while (0)
+# define permanent_kmaps_init(pgd_base)		do { } while (0)
+# define set_highmem_pages_init(bad_ppro)	do { } while (0)
 #endif /* CONFIG_HIGHMEM */
 
-unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+pteval_t __PAGE_KERNEL = _PAGE_KERNEL;
 EXPORT_SYMBOL(__PAGE_KERNEL);
-unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
 
-#ifdef CONFIG_NUMA
-extern void __init remap_numa_kva(void);
-#else
-#define remap_numa_kva() do {} while (0)
-#endif
+pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
 
 void __init native_pagetable_setup_start(pgd_t *base)
 {
@@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
 	memset(&base[USER_PTRS_PER_PGD], 0,
 	       KERNEL_PGD_PTRS * sizeof(pgd_t));
 #else
-	paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
+	paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
 #endif
 }
 
@@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init pagetable_init (void)
+static void __init pagetable_init(void)
 {
-	unsigned long vaddr, end;
 	pgd_t *pgd_base = swapper_pg_dir;
+	unsigned long vaddr, end;
 
 	paravirt_pagetable_setup_start(pgd_base);
 
@@ -435,9 +412,11 @@ static void __init pagetable_init (void)
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
 	 */
+	early_ioremap_clear();
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
 	page_table_range_init(vaddr, end, pgd_base);
+	early_ioremap_reset();
 
 	permanent_kmaps_init(pgd_base);
 
@@ -450,7 +429,7 @@ static void __init pagetable_init (void)
  * driver might have split up a kernel 4MB mapping.
  */
 char __nosavedata swsusp_pg_dir[PAGE_SIZE]
-	__attribute__ ((aligned (PAGE_SIZE)));
+	__attribute__ ((aligned(PAGE_SIZE)));
 
 static inline void save_pg_dir(void)
 {
@@ -462,7 +441,7 @@ static inline void save_pg_dir(void)
 }
 #endif
 
-void zap_low_mappings (void)
+void zap_low_mappings(void)
 {
 	int i;
 
@@ -474,22 +453,24 @@ void zap_low_mappings (void)
 	 * Note that "pgd_clear()" doesn't do it for
 	 * us, because pgd_clear() is a no-op on i386.
 	 */
-	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+	for (i = 0; i < USER_PTRS_PER_PGD; i++) {
 #ifdef CONFIG_X86_PAE
 		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
 		set_pgd(swapper_pg_dir+i, __pgd(0));
 #endif
+	}
 	flush_tlb_all();
 }
 
-int nx_enabled = 0;
+int nx_enabled;
+
+pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 #ifdef CONFIG_X86_PAE
 
-static int disable_nx __initdata = 0;
-u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
+static int disable_nx __initdata;
 
 /*
  * noexec = on|off
@@ -506,11 +487,14 @@ static int __init noexec_setup(char *str)
 			__supported_pte_mask |= _PAGE_NX;
 			disable_nx = 0;
 		}
-	} else if (!strcmp(str,"off")) {
-		disable_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-	} else
-		return -EINVAL;
+	} else {
+		if (!strcmp(str, "off")) {
+			disable_nx = 1;
+			__supported_pte_mask &= ~_PAGE_NX;
+		} else {
+			return -EINVAL;
+		}
+	}
 
 	return 0;
 }
@@ -522,6 +506,7 @@ static void __init set_nx(void)
 
 	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
 		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
 		if ((v[3] & (1 << 20)) && !disable_nx) {
 			rdmsr(MSR_EFER, l, h);
 			l |= EFER_NX;
@@ -531,35 +516,6 @@ static void __init set_nx(void)
 		}
 	}
 }
-
-/*
- * Enables/disables executability of a given kernel page and
- * returns the previous setting.
- */
-int __init set_kernel_exec(unsigned long vaddr, int enable)
-{
-	pte_t *pte;
-	int ret = 1;
-
-	if (!nx_enabled)
-		goto out;
-
-	pte = lookup_address(vaddr);
-	BUG_ON(!pte);
-
-	if (!pte_exec_kernel(*pte))
-		ret = 0;
-
-	if (enable)
-		pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
-	else
-		pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
-	pte_update_defer(&init_mm, vaddr, pte);
-	__flush_tlb_all();
-out:
-	return ret;
-}
-
 #endif
 
 /*
@@ -574,9 +530,8 @@ void __init paging_init(void)
 #ifdef CONFIG_X86_PAE
 	set_nx();
 	if (nx_enabled)
-		printk("NX (Execute Disable) protection: active\n");
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
 #endif
-
 	pagetable_init();
 
 	load_cr3(swapper_pg_dir);
@@ -600,10 +555,10 @@ void __init paging_init(void)
  * used to involve black magic jumps to work around some nasty CPU bugs,
  * but fortunately the switch to using exceptions got rid of all that.
  */
-
 static void __init test_wp_bit(void)
 {
-	printk("Checking if this processor honours the WP bit even in supervisor mode... ");
+	printk(KERN_INFO
+  "Checking if this processor honours the WP bit even in supervisor mode...");
 
 	/* Any page-aligned address will do, the test is non-destructive */
 	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
@@ -611,47 +566,46 @@ static void __init test_wp_bit(void)
 	clear_fixmap(FIX_WP_TEST);
 
 	if (!boot_cpu_data.wp_works_ok) {
-		printk("No.\n");
+		printk(KERN_CONT "No.\n");
 #ifdef CONFIG_X86_WP_WORKS_OK
-		panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+		panic(
+  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
 #endif
 	} else {
-		printk("Ok.\n");
+		printk(KERN_CONT "Ok.\n");
 	}
 }
 
-static struct kcore_list kcore_mem, kcore_vmalloc; 
+static struct kcore_list kcore_mem, kcore_vmalloc;
 
 void __init mem_init(void)
 {
-	extern int ppro_with_ram_bug(void);
 	int codesize, reservedpages, datasize, initsize;
-	int tmp;
-	int bad_ppro;
+	int tmp, bad_ppro;
 
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
-	
 	bad_ppro = ppro_with_ram_bug();
 
 #ifdef CONFIG_HIGHMEM
 	/* check that fixmap and pkmap do not overlap */
-	if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
-		printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
+	if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+		printk(KERN_ERR
+			"fixmap and kmap areas overlap - this will crash\n");
 		printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
-				PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
+				PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE,
+				FIXADDR_START);
 		BUG();
 	}
 #endif
- 
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
 	reservedpages = 0;
 	for (tmp = 0; tmp < max_low_pfn; tmp++)
 		/*
-		 * Only count reserved RAM pages
+		 * Only count reserved RAM pages:
 		 */
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
@@ -662,11 +616,12 @@ void __init mem_init(void)
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
-	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
 		   VMALLOC_END-VMALLOC_START);
 
-	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
+	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+			"%dk reserved, %dk data, %dk init, %ldk highmem)\n",
 		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 		num_physpages << (PAGE_SHIFT-10),
 		codesize >> 10,
@@ -677,45 +632,46 @@ void __init mem_init(void)
 	       );
 
 #if 1 /* double-sanity-check paranoia */
-	printk("virtual kernel memory layout:\n"
-	       "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+	printk(KERN_INFO "virtual kernel memory layout:\n"
+		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
-	       "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
-	       "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-	       "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
-	       "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-	       "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-	       "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
-	       FIXADDR_START, FIXADDR_TOP,
-	       (FIXADDR_TOP - FIXADDR_START) >> 10,
+		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+		"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+		"      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+		FIXADDR_START, FIXADDR_TOP,
+		(FIXADDR_TOP - FIXADDR_START) >> 10,
 
 #ifdef CONFIG_HIGHMEM
-	       PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
-	       (LAST_PKMAP*PAGE_SIZE) >> 10,
+		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+		(LAST_PKMAP*PAGE_SIZE) >> 10,
 #endif
 
-	       VMALLOC_START, VMALLOC_END,
-	       (VMALLOC_END - VMALLOC_START) >> 20,
+		VMALLOC_START, VMALLOC_END,
+		(VMALLOC_END - VMALLOC_START) >> 20,
 
-	       (unsigned long)__va(0), (unsigned long)high_memory,
-	       ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+		(unsigned long)__va(0), (unsigned long)high_memory,
+		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
 
-	       (unsigned long)&__init_begin, (unsigned long)&__init_end,
-	       ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+		(unsigned long)&__init_begin, (unsigned long)&__init_end,
+		((unsigned long)&__init_end -
+		 (unsigned long)&__init_begin) >> 10,
 
-	       (unsigned long)&_etext, (unsigned long)&_edata,
-	       ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+		(unsigned long)&_etext, (unsigned long)&_edata,
+		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
 
-	       (unsigned long)&_text, (unsigned long)&_etext,
-	       ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+		(unsigned long)&_text, (unsigned long)&_etext,
+		((unsigned long)&_etext - (unsigned long)&_text) >> 10);
 
 #ifdef CONFIG_HIGHMEM
-	BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
-	BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
+	BUG_ON(VMALLOC_END				> PKMAP_BASE);
 #endif
-	BUG_ON(VMALLOC_START                   > VMALLOC_END);
-	BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+	BUG_ON(VMALLOC_START				> VMALLOC_END);
+	BUG_ON((unsigned long)high_memory		> VMALLOC_START);
 #endif /* double-sanity-check paranoia */
 
 #ifdef CONFIG_X86_PAE
@@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return __add_pages(zone, start_pfn, nr_pages);
 }
-
 #endif
 
-struct kmem_cache *pmd_cache;
-
-void __init pgtable_cache_init(void)
-{
-	if (PTRS_PER_PMD > 1)
-		pmd_cache = kmem_cache_create("pmd",
-					      PTRS_PER_PMD*sizeof(pmd_t),
-					      PTRS_PER_PMD*sizeof(pmd_t),
-					      SLAB_PANIC,
-					      pmd_ctor);
-}
-
 /*
  * This function cannot be __init, since exceptions don't work in that
  * section.  Put this after the callers, so that it cannot be inlined.
  */
-static int noinline do_test_wp_bit(void)
+static noinline int do_test_wp_bit(void)
 {
 	char tmp_reg;
 	int flag;
 
 	__asm__ __volatile__(
-		"	movb %0,%1	\n"
-		"1:	movb %1,%0	\n"
-		"	xorl %2,%2	\n"
+		"	movb %0, %1	\n"
+		"1:	movb %1, %0	\n"
+		"	xorl %2, %2	\n"
 		"2:			\n"
-		".section __ex_table,\"a\"\n"
+		".section __ex_table, \"a\"\n"
 		"	.align 4	\n"
-		"	.long 1b,2b	\n"
+		"	.long 1b, 2b	\n"
 		".previous		\n"
 		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
 		 "=q" (tmp_reg),
 		 "=r" (flag)
 		:"2" (1)
 		:"memory");
-	
+
 	return flag;
 }
 
 #ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
 
 void mark_rodata_ro(void)
 {
@@ -801,32 +746,58 @@ void mark_rodata_ro(void)
 	if (num_possible_cpus() <= 1)
 #endif
 	{
-		change_page_attr(virt_to_page(start),
-		                 size >> PAGE_SHIFT, PAGE_KERNEL_RX);
-		printk("Write protecting the kernel text: %luk\n", size >> 10);
+		set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+		printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+			size >> 10);
+
+#ifdef CONFIG_CPA_DEBUG
+		printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+			start, start+size);
+		set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+		printk(KERN_INFO "Testing CPA: write protecting again\n");
+		set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
 	}
 #endif
 	start += size;
 	size = (unsigned long)__end_rodata - start;
-	change_page_attr(virt_to_page(start),
-	                 size >> PAGE_SHIFT, PAGE_KERNEL_RO);
-	printk("Write protecting the kernel read-only data: %luk\n",
-	       size >> 10);
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+		size >> 10);
+	rodata_test();
 
-	/*
-	 * change_page_attr() requires a global_flush_tlb() call after it.
-	 * We do this after the printk so that if something went wrong in the
-	 * change, the printk gets out at least to give a better debug hint
-	 * of who is the culprit.
-	 */
-	global_flush_tlb();
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
 }
 #endif
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * If debugging page accesses then do not free this memory but
+	 * mark them not present - any buggy init-section access will
+	 * create a kernel page fault:
+	 */
+	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+		begin, PAGE_ALIGN(end));
+	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
 	unsigned long addr;
 
+	/*
+	 * We just marked the kernel text read only above, now that
+	 * we are going to free part of that, we need to make that
+	 * writeable first.
+	 */
+	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
@@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 		totalram_pages++;
 	}
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+#endif
 }
 
 void free_initmem(void)
@@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	free_init_pages("initrd memory", start, end);
 }
 #endif
-
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0f9c8c89065..cc50a13ce8d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -43,12 +43,10 @@
 #include <asm/proto.h>
 #include <asm/smp.h>
 #include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
 
-#ifndef Dprintk
-#define Dprintk(x...)
-#endif
-
-const struct dma_mapping_ops* dma_ops;
+const struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static unsigned long dma_reserve __initdata;
@@ -65,22 +63,26 @@ void show_mem(void)
 {
 	long i, total = 0, reserved = 0;
 	long shared = 0, cached = 0;
-	pg_data_t *pgdat;
 	struct page *page;
+	pg_data_t *pgdat;
 
 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
-	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+	printk(KERN_INFO "Free swap:       %6ldkB\n",
+		nr_swap_pages << (PAGE_SHIFT-10));
 
 	for_each_online_pgdat(pgdat) {
-               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-			/* this loop can take a while with 256 GB and 4k pages
-			   so update the NMI watchdog */
-			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
+		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+			/*
+			 * This loop can take a while with 256 GB and
+			 * 4k pages so defer the NMI watchdog:
+			 */
+			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
 				touch_nmi_watchdog();
-			}
+
 			if (!pfn_valid(pgdat->node_start_pfn + i))
 				continue;
+
 			page = pfn_to_page(pgdat->node_start_pfn + i);
 			total++;
 			if (PageReserved(page))
@@ -89,51 +91,58 @@ void show_mem(void)
 				cached++;
 			else if (page_count(page))
 				shared += page_count(page) - 1;
-               }
+		}
 	}
-	printk(KERN_INFO "%lu pages of RAM\n", total);
-	printk(KERN_INFO "%lu reserved pages\n",reserved);
-	printk(KERN_INFO "%lu pages shared\n",shared);
-	printk(KERN_INFO "%lu pages swap cached\n",cached);
+	printk(KERN_INFO "%lu pages of RAM\n",		total);
+	printk(KERN_INFO "%lu reserved pages\n",	reserved);
+	printk(KERN_INFO "%lu pages shared\n",		shared);
+	printk(KERN_INFO "%lu pages swap cached\n",	cached);
 }
 
 int after_bootmem;
 
 static __init void *spp_getpage(void)
-{ 
+{
 	void *ptr;
+
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC); 
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
-	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
-		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
 
-	Dprintk("spp_getpage %p\n", ptr);
+	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+		panic("set_pte_phys: cannot allocate page data %s\n",
+			after_bootmem ? "after bootmem" : "");
+	}
+
+	pr_debug("spp_getpage %p\n", ptr);
+
 	return ptr;
-} 
+}
 
-static __init void set_pte_phys(unsigned long vaddr,
-			 unsigned long phys, pgprot_t prot)
+static __init void
+set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte, new_pte;
 
-	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
+	pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys);
 
 	pgd = pgd_offset_k(vaddr);
 	if (pgd_none(*pgd)) {
-		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		printk(KERN_ERR
+			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
 		return;
 	}
 	pud = pud_offset(pgd, vaddr);
 	if (pud_none(*pud)) {
-		pmd = (pmd_t *) spp_getpage(); 
+		pmd = (pmd_t *) spp_getpage();
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
 		if (pmd != pmd_offset(pud, 0)) {
-			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+				pmd, pmd_offset(pud, 0));
 			return;
 		}
 	}
@@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr,
 		pte = (pte_t *) spp_getpage();
 		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
 		if (pte != pte_offset_kernel(pmd, 0)) {
-			printk("PAGETABLE BUG #02!\n");
+			printk(KERN_ERR "PAGETABLE BUG #02!\n");
 			return;
 		}
 	}
@@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr,
 }
 
 /* NOTE: this is meant to be run only at boot */
-void __init 
-__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __init
+__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
 {
 	unsigned long address = __fix_to_virt(idx);
 
 	if (idx >= __end_of_fixed_addresses) {
-		printk("Invalid __set_fixmap\n");
+		printk(KERN_ERR "Invalid __set_fixmap\n");
 		return;
 	}
 	set_pte_phys(address, phys, prot);
 }
 
-unsigned long __meminitdata table_start, table_end;
+static unsigned long __initdata table_start;
+static unsigned long __meminitdata table_end;
 
 static __meminit void *alloc_low_page(unsigned long *phys)
-{ 
+{
 	unsigned long pfn = table_end++;
 	void *adr;
 
 	if (after_bootmem) {
 		adr = (void *)get_zeroed_page(GFP_ATOMIC);
 		*phys = __pa(adr);
+
 		return adr;
 	}
 
-	if (pfn >= end_pfn) 
-		panic("alloc_low_page: ran out of memory"); 
+	if (pfn >= end_pfn)
+		panic("alloc_low_page: ran out of memory");
 
 	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	memset(adr, 0, PAGE_SIZE);
@@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys)
 }
 
 static __meminit void unmap_low_page(void *adr)
-{ 
-
+{
 	if (after_bootmem)
 		return;
 
 	early_iounmap(adr, PAGE_SIZE);
-} 
+}
 
 /* Must run before zap_low_mappings */
 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
 {
-	unsigned long vaddr;
 	pmd_t *pmd, *last_pmd;
+	unsigned long vaddr;
 	int i, pmds;
 
 	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
 	vaddr = __START_KERNEL_map;
 	pmd = level2_kernel_pgt;
 	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
+
 	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
 		for (i = 0; i < pmds; i++) {
 			if (pmd_present(pmd[i]))
-				goto next;
+				goto continue_outer_loop;
 		}
 		vaddr += addr & ~PMD_MASK;
 		addr &= PMD_MASK;
+
 		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
-			set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
-		__flush_tlb();
+			set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+		__flush_tlb_all();
+
 		return (void *)vaddr;
-	next:
+continue_outer_loop:
 		;
 	}
-	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
+	printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size);
+
 	return NULL;
 }
 
-/* To avoid virtual aliases later */
+/*
+ * To avoid virtual aliases later:
+ */
 __meminit void early_iounmap(void *addr, unsigned long size)
 {
 	unsigned long vaddr;
@@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size)
 	vaddr = (unsigned long)addr;
 	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
 	pmd = level2_kernel_pgt + pmd_index(vaddr);
+
 	for (i = 0; i < pmds; i++)
 		pmd_clear(pmd + i);
-	__flush_tlb();
+
+	__flush_tlb_all();
 }
 
 static void __meminit
@@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 		pmd_t *pmd = pmd_page + pmd_index(address);
 
 		if (address >= end) {
-			if (!after_bootmem)
+			if (!after_bootmem) {
 				for (; i < PTRS_PER_PMD; i++, pmd++)
 					set_pmd(pmd, __pmd(0));
+			}
 			break;
 		}
 
 		if (pmd_val(*pmd))
 			continue;
 
-		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+		entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
 		entry &= __supported_pte_mask;
 		set_pmd(pmd, __pmd(entry));
 	}
@@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
 static void __meminit
 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
 {
-	pmd_t *pmd = pmd_offset(pud,0);
+	pmd_t *pmd = pmd_offset(pud, 0);
 	spin_lock(&init_mm.page_table_lock);
 	phys_pmd_init(pmd, address, end);
 	spin_unlock(&init_mm.page_table_lock);
 	__flush_tlb_all();
 }
 
-static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
-{ 
+static void __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
+{
 	int i = pud_index(addr);
 
-
-	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
+	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
 		unsigned long pmd_phys;
 		pud_t *pud = pud_page + pud_index(addr);
 		pmd_t *pmd;
@@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
 		if (addr >= end)
 			break;
 
-		if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
-			set_pud(pud, __pud(0)); 
+		if (!after_bootmem &&
+				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
+			set_pud(pud, __pud(0));
 			continue;
-		} 
+		}
 
 		if (pud_val(*pud)) {
 			phys_pmd_update(pud, addr, end);
@@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
 		}
 
 		pmd = alloc_low_page(&pmd_phys);
+
 		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
 		phys_pmd_init(pmd, addr, end);
 		spin_unlock(&init_mm.page_table_lock);
+
 		unmap_low_page(pmd);
 	}
-	__flush_tlb();
-} 
+	__flush_tlb_all();
+}
 
 static void __init find_early_table_space(unsigned long end)
 {
@@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end)
 	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
 		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
 
- 	/* RED-PEN putting page tables only on node 0 could
- 	   cause a hotspot and fill up ZONE_DMA. The page tables
- 	   need roughly 0.5KB per GB. */
- 	start = 0x8000;
- 	table_start = find_e820_area(start, end, tables);
+	/*
+	 * RED-PEN putting page tables only on node 0 could
+	 * cause a hotspot and fill up ZONE_DMA. The page tables
+	 * need roughly 0.5KB per GB.
+	 */
+	start = 0x8000;
+	table_start = find_e820_area(start, end, tables);
 	if (table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
+	/*
+	 * When you have a lot of RAM like 256GB, early_table will not fit
+	 * into 0x8000 range, find_e820_area() will find area after kernel
+	 * bss but the table_start is not page aligned, so need to round it
+	 * up to avoid overlap with bss:
+	 */
+	table_start = round_up(table_start, PAGE_SIZE);
 	table_start >>= PAGE_SHIFT;
 	table_end = table_start;
 
@@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end)
 		(table_start << PAGE_SHIFT) + tables);
 }
 
-/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
-   This runs before bootmem is initialized and gets pages directly from the 
-   physical memory. To access them they are temporarily mapped. */
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
-{ 
-	unsigned long next; 
+{
+	unsigned long next;
 
-	Dprintk("init_memory_mapping\n");
+	pr_debug("init_memory_mapping\n");
 
-	/* 
+	/*
 	 * Find space for the kernel direct mapping tables.
-	 * Later we should allocate these tables in the local node of the memory
-	 * mapped.  Unfortunately this is done currently before the nodes are 
-	 * discovered.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
 	 */
 	if (!after_bootmem)
 		find_early_table_space(end);
@@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
 	end = (unsigned long)__va(end);
 
 	for (; start < end; start = next) {
-		unsigned long pud_phys; 
 		pgd_t *pgd = pgd_offset_k(start);
+		unsigned long pud_phys;
 		pud_t *pud;
 
 		if (after_bootmem)
@@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
 			pud = alloc_low_page(&pud_phys);
 
 		next = start + PGDIR_SIZE;
-		if (next > end) 
-			next = end; 
+		if (next > end)
+			next = end;
 		phys_pud_init(pud, __pa(start), __pa(next));
 		if (!after_bootmem)
 			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 		unmap_low_page(pud);
-	} 
+	}
 
 	if (!after_bootmem)
 		mmu_cr4_features = read_cr4();
 	__flush_tlb_all();
+
+	reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
 }
 
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -402,39 +439,48 @@ void __init paging_init(void)
 }
 #endif
 
-/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
-   from the CPU leading to inconsistent cache lines. address and size
-   must be aligned to 2MB boundaries. 
-   Does nothing when the mapping doesn't exist. */
-void __init clear_kernel_mapping(unsigned long address, unsigned long size) 
+/*
+ * Unmap a kernel mapping if it exists. This is useful to avoid
+ * prefetches from the CPU leading to inconsistent cache lines.
+ * address and size must be aligned to 2MB boundaries.
+ * Does nothing when the mapping doesn't exist.
+ */
+void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 {
 	unsigned long end = address + size;
 
 	BUG_ON(address & ~LARGE_PAGE_MASK);
-	BUG_ON(size & ~LARGE_PAGE_MASK); 
-	
-	for (; address < end; address += LARGE_PAGE_SIZE) { 
+	BUG_ON(size & ~LARGE_PAGE_MASK);
+
+	for (; address < end; address += LARGE_PAGE_SIZE) {
 		pgd_t *pgd = pgd_offset_k(address);
 		pud_t *pud;
 		pmd_t *pmd;
+
 		if (pgd_none(*pgd))
 			continue;
+
 		pud = pud_offset(pgd, address);
 		if (pud_none(*pud))
-			continue; 
+			continue;
+
 		pmd = pmd_offset(pud, address);
 		if (!pmd || pmd_none(*pmd))
-			continue; 
-		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { 
-			/* Could handle this, but it should not happen currently. */
-			printk(KERN_ERR 
-	       "clear_kernel_mapping: mapping has been split. will leak memory\n"); 
-			pmd_ERROR(*pmd); 
+			continue;
+
+		if (!(pmd_val(*pmd) & _PAGE_PSE)) {
+			/*
+			 * Could handle this, but it should not happen
+			 * currently:
+			 */
+			printk(KERN_ERR "clear_kernel_mapping: "
+				"mapping has been split. will leak memory\n");
+			pmd_ERROR(*pmd);
 		}
-		set_pmd(pmd, __pmd(0)); 		
+		set_pmd(pmd, __pmd(0));
 	}
 	__flush_tlb_all();
-} 
+}
 
 /*
  * Memory hotplug specific functions
@@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	init_memory_mapping(start, (start + size -1));
+	init_memory_mapping(start, start + size-1);
 
 	ret = __add_pages(zone, start_pfn, nr_pages);
-	if (ret)
-		goto error;
+	WARN_ON(1);
 
 	return ret;
-error:
-	printk("%s: Problem encountered in __add_pages!\n", __func__);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
@@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
- */
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
-{
-	int err = -EIO;
-	unsigned long pfn;
-	unsigned long total = 0, mem = 0;
-	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
-		if (pfn_valid(pfn)) {
-			online_page(pfn_to_page(pfn));
-			err = 0;
-			mem++;
-		}
-		total++;
-	}
-	if (!err) {
-		z->spanned_pages += total;
-		z->present_pages += mem;
-		z->zone_pgdat->node_spanned_pages += total;
-		z->zone_pgdat->node_present_pages += mem;
-	}
-	return err;
-}
-#endif
-
-static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
-			 kcore_vsyscall;
+static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
+			 kcore_modules, kcore_vsyscall;
 
 void __init mem_init(void)
 {
@@ -521,8 +535,15 @@ void __init mem_init(void)
 
 	pci_iommu_alloc();
 
-	/* clear the zero-page */
-	memset(empty_zero_page, 0, PAGE_SIZE);
+	/* clear_bss() already clear the empty_zero_page */
+
+	/* temporary debugging - double check it's true: */
+	{
+		int i;
+
+		for (i = 0; i < 1024; i++)
+			WARN_ON_ONCE(empty_zero_page[i]);
+	}
 
 	reservedpages = 0;
 
@@ -534,7 +555,6 @@ void __init mem_init(void)
 #endif
 	reservedpages = end_pfn - totalram_pages -
 					absent_pages_in_range(0, end_pfn);
-
 	after_bootmem = 1;
 
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
@@ -542,15 +562,16 @@ void __init mem_init(void)
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
 	/* Register memory areas for /proc/kcore */
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
-	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
+	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
 		   VMALLOC_END-VMALLOC_START);
 	kclist_add(&kcore_kernel, &_stext, _end - _stext);
 	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
-	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, 
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
 				 VSYSCALL_END - VSYSCALL_START);
 
-	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
+	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
+				"%ldk reserved, %ldk data, %ldk init)\n",
 		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
 		end_pfn << (PAGE_SHIFT-10),
 		codesize >> 10,
@@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	if (begin >= end)
 		return;
 
+	/*
+	 * If debugging page accesses then do not free this memory but
+	 * mark them not present - any buggy init-section access will
+	 * create a kernel page fault:
+	 */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+		begin, PAGE_ALIGN(end));
+	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
 	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
 	for (addr = begin; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
 		memset((void *)(addr & ~(PAGE_SIZE-1)),
 			POISON_FREE_INITMEM, PAGE_SIZE);
-		if (addr >= __START_KERNEL_map)
-			change_page_attr_addr(addr, 1, __pgprot(0));
 		free_page(addr);
 		totalram_pages++;
 	}
-	if (addr > __START_KERNEL_map)
-		global_flush_tlb();
+#endif
 }
 
 void free_initmem(void)
@@ -589,6 +618,8 @@ void free_initmem(void)
 }
 
 #ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
 
 void mark_rodata_ro(void)
 {
@@ -603,25 +634,27 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_KPROBES
 	start = (unsigned long)__start_rodata;
 #endif
-	
+
 	end = (unsigned long)__end_rodata;
 	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
 	end &= PAGE_MASK;
 	if (end <= start)
 		return;
 
-	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
+	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
 
-	/*
-	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
-	 * We do this after the printk so that if something went wrong in the
-	 * change, the printk gets out at least to give a better debug hint
-	 * of who is the culprit.
-	 */
-	global_flush_tlb();
+	rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+	printk(KERN_INFO "Testing CPA: again\n");
+	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
 }
 #endif
 
@@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-void __init reserve_bootmem_generic(unsigned long phys, unsigned len) 
-{ 
+void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
+{
 #ifdef CONFIG_NUMA
 	int nid = phys_to_nid(phys);
 #endif
 	unsigned long pfn = phys >> PAGE_SHIFT;
+
 	if (pfn >= end_pfn) {
-		/* This can happen with kdump kernels when accessing firmware
-		   tables. */
+		/*
+		 * This can happen with kdump kernels when accessing
+		 * firmware tables:
+		 */
 		if (pfn < end_pfn_map)
 			return;
+
 		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
 				phys, len);
 		return;
@@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 
 	/* Should check here against the e820 map to avoid double free */
 #ifdef CONFIG_NUMA
-  	reserve_bootmem_node(NODE_DATA(nid), phys, len);
-#else       		
-	reserve_bootmem(phys, len);    
+	reserve_bootmem_node(NODE_DATA(nid), phys, len);
+#else
+	reserve_bootmem(phys, len);
 #endif
 	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
 		dma_reserve += len / PAGE_SIZE;
@@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 	}
 }
 
-int kern_addr_valid(unsigned long addr) 
-{ 
+int kern_addr_valid(unsigned long addr)
+{
 	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
 
 	if (above != 0 && above != -1UL)
-		return 0; 
-	
+		return 0;
+
 	pgd = pgd_offset_k(addr);
 	if (pgd_none(*pgd))
 		return 0;
 
 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud))
-		return 0; 
+		return 0;
 
 	pmd = pmd_offset(pud, addr);
 	if (pmd_none(*pmd))
 		return 0;
+
 	if (pmd_large(*pmd))
 		return pfn_valid(pmd_pfn(*pmd));
 
 	pte = pte_offset_kernel(pmd, addr);
 	if (pte_none(*pte))
 		return 0;
+
 	return pfn_valid(pte_pfn(*pte));
 }
 
-/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
-   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
-   not need special handling anymore. */
-
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
+ * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
+ * not need special handling anymore:
+ */
 static struct vm_area_struct gate_vma = {
-	.vm_start = VSYSCALL_START,
-	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
-	.vm_page_prot = PAGE_READONLY_EXEC,
-	.vm_flags = VM_READ | VM_EXEC
+	.vm_start	= VSYSCALL_START,
+	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+	.vm_page_prot	= PAGE_READONLY_EXEC,
+	.vm_flags	= VM_READ | VM_EXEC
 };
 
 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
@@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
 int in_gate_area(struct task_struct *task, unsigned long addr)
 {
 	struct vm_area_struct *vma = get_gate_vma(task);
+
 	if (!vma)
 		return 0;
+
 	return (addr >= vma->vm_start) && (addr < vma->vm_end);
 }
 
-/* Use this when you have no reliable task/vma, typically from interrupt
- * context.  It is less reliable than using the task's vma and may give
- * false positives.
+/*
+ * Use this when you have no reliable task/vma, typically from interrupt
+ * context. It is less reliable than using the task's vma and may give
+ * false positives:
  */
 int in_gate_area_no_task(unsigned long addr)
 {
@@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
  */
-int __meminit vmemmap_populate(struct page *start_page,
-						unsigned long size, int node)
+int __meminit
+vmemmap_populate(struct page *start_page, unsigned long size, int node)
 {
 	unsigned long addr = (unsigned long)start_page;
 	unsigned long end = (unsigned long)(start_page + size);
@@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page,
 		pgd = vmemmap_pgd_populate(addr, node);
 		if (!pgd)
 			return -ENOMEM;
+
 		pud = vmemmap_pud_populate(pgd, addr, node);
 		if (!pud)
 			return -ENOMEM;
@@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page,
 		pmd = pmd_offset(pud, addr);
 		if (pmd_none(*pmd)) {
 			pte_t entry;
-			void *p = vmemmap_alloc_block(PMD_SIZE, node);
+			void *p;
+
+			p = vmemmap_alloc_block(PMD_SIZE, node);
 			if (!p)
 				return -ENOMEM;
 
-			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
-			mk_pte_huge(entry);
+			entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+							PAGE_KERNEL_LARGE);
 			set_pmd(pmd, __pmd(pte_val(entry)));
 
 			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
 				addr, addr + PMD_SIZE - 1, p, node);
-		} else
+		} else {
 			vmemmap_verify((pte_t *)pmd, node, addr, next);
+		}
 	}
-
 	return 0;
 }
 #endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 00000000000..ed795721ca8
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,501 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+enum ioremap_mode {
+	IOR_MODE_UNCACHED,
+	IOR_MODE_CACHED,
+};
+
+#ifdef CONFIG_X86_64
+
+unsigned long __phys_addr(unsigned long x)
+{
+	if (x >= __START_KERNEL_map)
+		return x - __START_KERNEL_map + phys_base;
+	return x - PAGE_OFFSET;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+#endif
+
+int page_is_ram(unsigned long pagenr)
+{
+	unsigned long addr, end;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		/*
+		 * Not usable memory:
+		 */
+		if (e820.map[i].type != E820_RAM)
+			continue;
+		addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
+		end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
+
+		/*
+		 * Sanity check: Some BIOSen report areas as RAM that
+		 * are not. Notably the 640->1Mb area, which is the
+		 * PCI BIOS area.
+		 */
+		if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
+		    end < (BIOS_END >> PAGE_SHIFT))
+			continue;
+
+		if ((pagenr >= addr) && (pagenr < end))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int ioremap_change_attr(unsigned long paddr, unsigned long size,
+			       enum ioremap_mode mode)
+{
+	unsigned long vaddr = (unsigned long)__va(paddr);
+	unsigned long nrpages = size >> PAGE_SHIFT;
+	int err, level;
+
+	/* No change for pages after the last mapping */
+	if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT))
+		return 0;
+
+	/*
+	 * If there is no identity map for this address,
+	 * change_page_attr_addr is unnecessary
+	 */
+	if (!lookup_address(vaddr, &level))
+		return 0;
+
+	switch (mode) {
+	case IOR_MODE_UNCACHED:
+	default:
+		err = set_memory_uc(vaddr, nrpages);
+		break;
+	case IOR_MODE_CACHED:
+		err = set_memory_wb(vaddr, nrpages);
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
+			       enum ioremap_mode mode)
+{
+	void __iomem *addr;
+	struct vm_struct *area;
+	unsigned long offset, last_addr;
+	pgprot_t prot;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
+		return NULL;
+
+	/*
+	 * Don't remap the low PCI/ISA area, it's always mapped..
+	 */
+	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
+		return (__force void __iomem *)phys_to_virt(phys_addr);
+
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using..
+	 */
+	for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped &&
+	     (offset << PAGE_SHIFT) < last_addr; offset++) {
+		if (page_is_ram(offset))
+			return NULL;
+	}
+
+	switch (mode) {
+	case IOR_MODE_UNCACHED:
+	default:
+		prot = PAGE_KERNEL_NOCACHE;
+		break;
+	case IOR_MODE_CACHED:
+		prot = PAGE_KERNEL;
+		break;
+	}
+
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+	/*
+	 * Ok, go for it..
+	 */
+	area = get_vm_area(size, VM_IOREMAP);
+	if (!area)
+		return NULL;
+	area->phys_addr = phys_addr;
+	addr = (void __iomem *) area->addr;
+	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
+			       phys_addr, prot)) {
+		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+		return NULL;
+	}
+
+	if (ioremap_change_attr(phys_addr, size, mode) < 0) {
+		vunmap(addr);
+		return NULL;
+	}
+
+	return (void __iomem *) (offset + (char __iomem *)addr);
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @offset:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size)
+{
+	return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+	struct vm_struct *p, *o;
+
+	if ((void __force *)addr <= high_memory)
+		return;
+
+	/*
+	 * __ioremap special-cases the PCI/ISA range by not instantiating a
+	 * vm_area and by simply returning an address into the kernel mapping
+	 * of ISA space.   So handle that here.
+	 */
+	if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
+	    addr < phys_to_virt(ISA_END_ADDRESS))
+		return;
+
+	addr = (volatile void __iomem *)
+		(PAGE_MASK & (unsigned long __force)addr);
+
+	/* Use the vm area unlocked, assuming the caller
+	   ensures there isn't another iounmap for the same address
+	   in parallel. Reuse of the virtual address is prevented by
+	   leaving it in the global lists until we're done with it.
+	   cpa takes care of the direct mappings. */
+	read_lock(&vmlist_lock);
+	for (p = vmlist; p; p = p->next) {
+		if (p->addr == addr)
+			break;
+	}
+	read_unlock(&vmlist_lock);
+
+	if (!p) {
+		printk(KERN_ERR "iounmap: bad address %p\n", addr);
+		dump_stack();
+		return;
+	}
+
+	/* Reset the direct mapping. Can block */
+	ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED);
+
+	/* Finally remove it */
+	o = remove_vm_area((void *)addr);
+	BUG_ON(p != o || o == NULL);
+	kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+#ifdef CONFIG_X86_32
+
+int __initdata early_ioremap_debug;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+	early_ioremap_debug = 1;
+
+	return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static __initdata int after_paging_init;
+static __initdata unsigned long bm_pte[1024]
+				__attribute__((aligned(PAGE_SIZE)));
+
+static inline unsigned long * __init early_ioremap_pgd(unsigned long addr)
+{
+	return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023);
+}
+
+static inline unsigned long * __init early_ioremap_pte(unsigned long addr)
+{
+	return bm_pte + ((addr >> PAGE_SHIFT) & 1023);
+}
+
+void __init early_ioremap_init(void)
+{
+	unsigned long *pgd;
+
+	if (early_ioremap_debug)
+		printk(KERN_INFO "early_ioremap_init()\n");
+
+	pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+	*pgd = __pa(bm_pte) | _PAGE_TABLE;
+	memset(bm_pte, 0, sizeof(bm_pte));
+	/*
+	 * The boot-ioremap range spans multiple pgds, for which
+	 * we are not prepared:
+	 */
+	if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) {
+		WARN_ON(1);
+		printk(KERN_WARNING "pgd %p != %p\n",
+		       pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END)));
+		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+		       fix_to_virt(FIX_BTMAP_BEGIN));
+		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
+		       fix_to_virt(FIX_BTMAP_END));
+
+		printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
+		printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
+		       FIX_BTMAP_BEGIN);
+	}
+}
+
+void __init early_ioremap_clear(void)
+{
+	unsigned long *pgd;
+
+	if (early_ioremap_debug)
+		printk(KERN_INFO "early_ioremap_clear()\n");
+
+	pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN));
+	*pgd = 0;
+	paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT);
+	__flush_tlb_all();
+}
+
+void __init early_ioremap_reset(void)
+{
+	enum fixed_addresses idx;
+	unsigned long *pte, phys, addr;
+
+	after_paging_init = 1;
+	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
+		addr = fix_to_virt(idx);
+		pte = early_ioremap_pte(addr);
+		if (!*pte & _PAGE_PRESENT) {
+			phys = *pte & PAGE_MASK;
+			set_fixmap(idx, phys);
+		}
+	}
+}
+
+static void __init __early_set_fixmap(enum fixed_addresses idx,
+				   unsigned long phys, pgprot_t flags)
+{
+	unsigned long *pte, addr = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+	pte = early_ioremap_pte(addr);
+	if (pgprot_val(flags))
+		*pte = (phys & PAGE_MASK) | pgprot_val(flags);
+	else
+		*pte = 0;
+	__flush_tlb_one(addr);
+}
+
+static inline void __init early_set_fixmap(enum fixed_addresses idx,
+					unsigned long phys)
+{
+	if (after_paging_init)
+		set_fixmap(idx, phys);
+	else
+		__early_set_fixmap(idx, phys, PAGE_KERNEL);
+}
+
+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
+{
+	if (after_paging_init)
+		clear_fixmap(idx);
+	else
+		__early_set_fixmap(idx, 0, __pgprot(0));
+}
+
+
+int __initdata early_ioremap_nested;
+
+static int __init check_early_ioremap_leak(void)
+{
+	if (!early_ioremap_nested)
+		return 0;
+
+	printk(KERN_WARNING
+	       "Debug warning: early ioremap leak of %d areas detected.\n",
+	       early_ioremap_nested);
+	printk(KERN_WARNING
+	       "please boot with early_ioremap_debug and report the dmesg.\n");
+	WARN_ON(1);
+
+	return 1;
+}
+late_initcall(check_early_ioremap_leak);
+
+void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
+{
+	unsigned long offset, last_addr;
+	unsigned int nrpages, nesting;
+	enum fixed_addresses idx0, idx;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	nesting = early_ioremap_nested;
+	if (early_ioremap_debug) {
+		printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
+		       phys_addr, size, nesting);
+		dump_stack();
+	}
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (nesting >= FIX_BTMAPS_NESTING) {
+		WARN_ON(1);
+		return NULL;
+	}
+	early_ioremap_nested++;
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr) - phys_addr;
+
+	/*
+	 * Mappings have to fit in the FIX_BTMAP area.
+	 */
+	nrpages = size >> PAGE_SHIFT;
+	if (nrpages > NR_FIX_BTMAPS) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	/*
+	 * Ok, go for it..
+	 */
+	idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+	idx = idx0;
+	while (nrpages > 0) {
+		early_set_fixmap(idx, phys_addr);
+		phys_addr += PAGE_SIZE;
+		--idx;
+		--nrpages;
+	}
+	if (early_ioremap_debug)
+		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+
+	return (void *) (offset + fix_to_virt(idx0));
+}
+
+void __init early_iounmap(void *addr, unsigned long size)
+{
+	unsigned long virt_addr;
+	unsigned long offset;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	unsigned int nesting;
+
+	nesting = --early_ioremap_nested;
+	WARN_ON(nesting < 0);
+
+	if (early_ioremap_debug) {
+		printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+		       size, nesting);
+		dump_stack();
+	}
+
+	virt_addr = (unsigned long)addr;
+	if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
+		WARN_ON(1);
+		return;
+	}
+	offset = virt_addr & ~PAGE_MASK;
+	nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
+
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting;
+	while (nrpages > 0) {
+		early_clear_fixmap(idx);
+		--idx;
+		--nrpages;
+	}
+}
+
+void __this_fixmap_does_not_exist(void)
+{
+	WARN_ON(1);
+}
+
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c
deleted file mode 100644
index 0b278315d73..00000000000
--- a/arch/x86/mm/ioremap_32.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * arch/i386/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-#include <asm/fixmap.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-
-#define ISA_START_ADDRESS	0xa0000
-#define ISA_END_ADDRESS		0x100000
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-	void __iomem * addr;
-	struct vm_struct * area;
-	unsigned long offset, last_addr;
-	pgprot_t prot;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-		return (void __iomem *) phys_to_virt(phys_addr);
-
-	/*
-	 * Don't allow anybody to remap normal RAM that we're using..
-	 */
-	if (phys_addr <= virt_to_phys(high_memory - 1)) {
-		char *t_addr, *t_end;
-		struct page *page;
-
-		t_addr = __va(phys_addr);
-		t_end = t_addr + (size - 1);
-	   
-		for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-			if(!PageReserved(page))
-				return NULL;
-	}
-
-	prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY
-			| _PAGE_ACCESSED | flags);
-
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-	/*
-	 * Ok, go for it..
-	 */
-	area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-	if (!area)
-		return NULL;
-	area->phys_addr = phys_addr;
-	addr = (void __iomem *) area->addr;
-	if (ioremap_page_range((unsigned long) addr,
-			(unsigned long) addr + size, phys_addr, prot)) {
-		vunmap((void __force *) addr);
-		return NULL;
-	}
-	return (void __iomem *) (offset + (char __iomem *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @offset:    bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address. 
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many 
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- * 
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-	unsigned long last_addr;
-	void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
-	if (!p) 
-		return p; 
-
-	/* Guaranteed to be > phys_addr, as per __ioremap() */
-	last_addr = phys_addr + size - 1;
-
-	if (last_addr < virt_to_phys(high_memory) - 1) {
-		struct page *ppage = virt_to_page(__va(phys_addr));		
-		unsigned long npages;
-
-		phys_addr &= PAGE_MASK;
-
-		/* This might overflow and become zero.. */
-		last_addr = PAGE_ALIGN(last_addr);
-
-		/* .. but that's ok, because modulo-2**n arithmetic will make
-	 	* the page-aligned "last - first" come out right.
-	 	*/
-		npages = (last_addr - phys_addr) >> PAGE_SHIFT;
-
-		if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
-			iounmap(p); 
-			p = NULL;
-		}
-		global_flush_tlb();
-	}
-
-	return p;					
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-	struct vm_struct *p, *o;
-
-	if ((void __force *)addr <= high_memory)
-		return;
-
-	/*
-	 * __ioremap special-cases the PCI/ISA range by not instantiating a
-	 * vm_area and by simply returning an address into the kernel mapping
-	 * of ISA space.   So handle that here.
-	 */
-	if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
-			addr < phys_to_virt(ISA_END_ADDRESS))
-		return;
-
-	addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-
-	/* Use the vm area unlocked, assuming the caller
-	   ensures there isn't another iounmap for the same address
-	   in parallel. Reuse of the virtual address is prevented by
-	   leaving it in the global lists until we're done with it.
-	   cpa takes care of the direct mappings. */
-	read_lock(&vmlist_lock);
-	for (p = vmlist; p; p = p->next) {
-		if (p->addr == addr)
-			break;
-	}
-	read_unlock(&vmlist_lock);
-
-	if (!p) {
-		printk("iounmap: bad address %p\n", addr);
-		dump_stack();
-		return;
-	}
-
-	/* Reset the direct mapping. Can block */
-	if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
-		change_page_attr(virt_to_page(__va(p->phys_addr)),
-				 get_vm_area_size(p) >> PAGE_SHIFT,
-				 PAGE_KERNEL);
-		global_flush_tlb();
-	} 
-
-	/* Finally remove it */
-	o = remove_vm_area((void *)addr);
-	BUG_ON(p != o || o == NULL);
-	kfree(p); 
-}
-EXPORT_SYMBOL(iounmap);
-
-void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
-{
-	unsigned long offset, last_addr;
-	unsigned int nrpages;
-	enum fixed_addresses idx;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-		return phys_to_virt(phys_addr);
-
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr) - phys_addr;
-
-	/*
-	 * Mappings have to fit in the FIX_BTMAP area.
-	 */
-	nrpages = size >> PAGE_SHIFT;
-	if (nrpages > NR_FIX_BTMAPS)
-		return NULL;
-
-	/*
-	 * Ok, go for it..
-	 */
-	idx = FIX_BTMAP_BEGIN;
-	while (nrpages > 0) {
-		set_fixmap(idx, phys_addr);
-		phys_addr += PAGE_SIZE;
-		--idx;
-		--nrpages;
-	}
-	return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
-}
-
-void __init bt_iounmap(void *addr, unsigned long size)
-{
-	unsigned long virt_addr;
-	unsigned long offset;
-	unsigned int nrpages;
-	enum fixed_addresses idx;
-
-	virt_addr = (unsigned long)addr;
-	if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
-		return;
-	offset = virt_addr & ~PAGE_MASK;
-	nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
-
-	idx = FIX_BTMAP_BEGIN;
-	while (nrpages > 0) {
-		clear_fixmap(idx);
-		--idx;
-		--nrpages;
-	}
-}
diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c
deleted file mode 100644
index 6cac90aa503..00000000000
--- a/arch/x86/mm/ioremap_64.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * arch/x86_64/mm/ioremap.c
- *
- * Re-map IO memory to kernel address space so that we can access it.
- * This is needed for high PCI addresses that aren't mapped in the
- * 640k-1MB IO memory area on PC's
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- */
-
-#include <linux/vmalloc.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/io.h>
-
-#include <asm/pgalloc.h>
-#include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-#include <asm/cacheflush.h>
-#include <asm/proto.h>
-
-unsigned long __phys_addr(unsigned long x)
-{
-	if (x >= __START_KERNEL_map)
-		return x - __START_KERNEL_map + phys_base;
-	return x - PAGE_OFFSET;
-}
-EXPORT_SYMBOL(__phys_addr);
-
-#define ISA_START_ADDRESS      0xa0000
-#define ISA_END_ADDRESS                0x100000
-
-/*
- * Fix up the linear direct mapping of the kernel to avoid cache attribute
- * conflicts.
- */
-static int
-ioremap_change_attr(unsigned long phys_addr, unsigned long size,
-					unsigned long flags)
-{
-	int err = 0;
-	if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
-		unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		unsigned long vaddr = (unsigned long) __va(phys_addr);
-
-		/*
- 		 * Must use a address here and not struct page because the phys addr
-		 * can be a in hole between nodes and not have an memmap entry.
-		 */
-		err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
-		if (!err)
-			global_flush_tlb();
-	}
-	return err;
-}
-
-/*
- * Generic mapping function
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
-{
-	void * addr;
-	struct vm_struct * area;
-	unsigned long offset, last_addr;
-	pgprot_t pgprot;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	/*
-	 * Don't remap the low PCI/ISA area, it's always mapped..
-	 */
-	if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
-		return (__force void __iomem *)phys_to_virt(phys_addr);
-
-#ifdef CONFIG_FLATMEM
-	/*
-	 * Don't allow anybody to remap normal RAM that we're using..
-	 */
-	if (last_addr < virt_to_phys(high_memory)) {
-		char *t_addr, *t_end;
- 		struct page *page;
-
-		t_addr = __va(phys_addr);
-		t_end = t_addr + (size - 1);
-	   
-		for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-			if(!PageReserved(page))
-				return NULL;
-	}
-#endif
-
-	pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL
-			  | _PAGE_DIRTY | _PAGE_ACCESSED | flags);
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr+1) - phys_addr;
-
-	/*
-	 * Ok, go for it..
-	 */
-	area = get_vm_area(size, VM_IOREMAP | (flags << 20));
-	if (!area)
-		return NULL;
-	area->phys_addr = phys_addr;
-	addr = area->addr;
-	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, pgprot)) {
-		remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
-		return NULL;
-	}
-	if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) {
-		area->flags &= 0xffffff;
-		vunmap(addr);
-		return NULL;
-	}
-	return (__force void __iomem *) (offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-/**
- * ioremap_nocache     -   map bus memory into CPU space
- * @offset:    bus address of the memory
- * @size:      size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address. 
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many 
- * busses. In particular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- * 
- * Must be freed with iounmap.
- */
-
-void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
-{
-	return __ioremap(phys_addr, size, _PAGE_PCD);
-}
-EXPORT_SYMBOL(ioremap_nocache);
-
-/**
- * iounmap - Free a IO remapping
- * @addr: virtual address from ioremap_*
- *
- * Caller must ensure there is only one unmapping for the same pointer.
- */
-void iounmap(volatile void __iomem *addr)
-{
-	struct vm_struct *p, *o;
-
-	if (addr <= high_memory) 
-		return; 
-	if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
-		addr < phys_to_virt(ISA_END_ADDRESS))
-		return;
-
-	addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
-	/* Use the vm area unlocked, assuming the caller
-	   ensures there isn't another iounmap for the same address
-	   in parallel. Reuse of the virtual address is prevented by
-	   leaving it in the global lists until we're done with it.
-	   cpa takes care of the direct mappings. */
-	read_lock(&vmlist_lock);
-	for (p = vmlist; p; p = p->next) {
-		if (p->addr == addr)
-			break;
-	}
-	read_unlock(&vmlist_lock);
-
-	if (!p) {
-		printk("iounmap: bad address %p\n", addr);
-		dump_stack();
-		return;
-	}
-
-	/* Reset the direct mapping. Can block */
-	if (p->flags >> 20)
-		ioremap_change_attr(p->phys_addr, p->size, 0);
-
-	/* Finally remove it */
-	o = remove_vm_area((void *)addr);
-	BUG_ON(p != o || o == NULL);
-	kfree(p); 
-}
-EXPORT_SYMBOL(iounmap);
-
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index a96006f7ae0..7a2ebce87df 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -1,9 +1,9 @@
-/* 
+/*
  * AMD K8 NUMA support.
  * Discover the memory map and associated nodes.
- * 
+ *
  * This version reads it directly from the K8 northbridge.
- * 
+ *
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
 #include <linux/kernel.h>
@@ -22,132 +22,135 @@
 
 static __init int find_northbridge(void)
 {
-	int num; 
+	int num;
 
-	for (num = 0; num < 32; num++) { 
+	for (num = 0; num < 32; num++) {
 		u32 header;
-		
-		header = read_pci_config(0, num, 0, 0x00);  
-		if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)))
-			continue; 	
-
-		header = read_pci_config(0, num, 1, 0x00); 
-		if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)))
-			continue;	
-		return num; 
-	} 
-
-	return -1; 	
+
+		header = read_pci_config(0, num, 0, 0x00);
+		if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+			continue;
+
+		header = read_pci_config(0, num, 1, 0x00);
+		if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+			continue;
+		return num;
+	}
+
+	return -1;
 }
 
 int __init k8_scan_nodes(unsigned long start, unsigned long end)
-{ 
+{
 	unsigned long prevbase;
 	struct bootnode nodes[8];
-	int nodeid, i, j, nb;
+	int nodeid, i, nb;
 	unsigned char nodeids[8];
 	int found = 0;
 	u32 reg;
 	unsigned numnodes;
-	unsigned num_cores;
+	unsigned cores;
+	unsigned bits;
+	int j;
 
 	if (!early_pci_allowed())
 		return -1;
 
-	nb = find_northbridge(); 
-	if (nb < 0) 
+	nb = find_northbridge();
+	if (nb < 0)
 		return nb;
 
-	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 
-
-	num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-	printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
+	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
 
-	reg = read_pci_config(0, nb, 0, 0x60); 
+	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
 		return -1;
 
 	printk(KERN_INFO "Number of nodes %d\n", numnodes);
 
-	memset(&nodes,0,sizeof(nodes)); 
+	memset(&nodes, 0, sizeof(nodes));
 	prevbase = 0;
-	for (i = 0; i < 8; i++) { 
-		unsigned long base,limit; 
+	for (i = 0; i < 8; i++) {
+		unsigned long base, limit;
 		u32 nodeid;
-		
+
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7; 
+		nodeid = limit & 7;
 		nodeids[i] = nodeid;
-		if ((base & 3) == 0) { 
+		if ((base & 3) == 0) {
 			if (i < numnodes)
-				printk("Skipping disabled node %d\n", i); 
+				printk("Skipping disabled node %d\n", i);
 			continue;
-		} 
+		}
 		if (nodeid >= numnodes) {
 			printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
-			       base, limit); 
+			       base, limit);
 			continue;
-		} 
+		}
 
-		if (!limit) { 
-			printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i,
-			       base);
+		if (!limit) {
+			printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
+			       i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 
-			       nodeid, (base>>8)&3, (limit>>8) & 3); 
-			return -1; 
-		}	
+			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
+			       nodeid, (base>>8)&3, (limit>>8) & 3);
+			return -1;
+		}
 		if (node_isset(nodeid, node_possible_map)) {
-			printk(KERN_INFO "Node %d already present. Skipping\n", 
+			printk(KERN_INFO "Node %d already present. Skipping\n",
 			       nodeid);
 			continue;
 		}
 
-		limit >>= 16; 
-		limit <<= 24; 
+		limit >>= 16;
+		limit <<= 24;
 		limit |= (1<<24)-1;
 		limit++;
 
 		if (limit > end_pfn << PAGE_SHIFT)
 			limit = end_pfn << PAGE_SHIFT;
 		if (limit <= base)
-			continue; 
-			
+			continue;
+
 		base >>= 16;
-		base <<= 24; 
-
-		if (base < start) 
-			base = start; 
-		if (limit > end) 
-			limit = end; 
-		if (limit == base) { 
-			printk(KERN_ERR "Empty node %d\n", nodeid); 
-			continue; 
+		base <<= 24;
+
+		if (base < start)
+			base = start;
+		if (limit > end)
+			limit = end;
+		if (limit == base) {
+			printk(KERN_ERR "Empty node %d\n", nodeid);
+			continue;
 		}
-		if (limit < base) { 
+		if (limit < base) {
 			printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
-			       nodeid, base, limit); 			       
+			       nodeid, base, limit);
 			continue;
-		} 
-		
+		}
+
 		/* Could sort here, but pun for now. Should not happen anyroads. */
-		if (prevbase > base) { 
+		if (prevbase > base) {
 			printk(KERN_ERR "Node map not sorted %lx,%lx\n",
-			       prevbase,base);
+			       prevbase, base);
 			return -1;
 		}
-			
-		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 
-		       nodeid, base, limit); 
-		
+
+		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
+		       nodeid, base, limit);
+
 		found++;
-		
-		nodes[nodeid].start = base; 
+
+		nodes[nodeid].start = base;
 		nodes[nodeid].end = limit;
 		e820_register_active_regions(nodeid,
 				nodes[nodeid].start >> PAGE_SHIFT,
@@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		prevbase = base;
 
 		node_set(nodeid, node_possible_map);
-	} 
+	}
 
 	if (!found)
-		return -1; 
+		return -1;
 
 	memnode_shift = compute_hash_shift(nodes, 8);
-	if (memnode_shift < 0) { 
-		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 
-		return -1; 
-	} 
-	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+		return -1;
+	}
+	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+
+	/* use the coreid bits from early_identify_cpu */
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = (1<<bits);
 
 	for (i = 0; i < 8; i++) {
-		if (nodes[i].start != nodes[i].end) { 
+		if (nodes[i].start != nodes[i].end) {
 			nodeid = nodeids[i];
-			for (j = 0; j < num_cores; j++)
-				apicid_to_node[(nodeid * num_cores) + j] = i;
-			setup_node_bootmem(i, nodes[i].start, nodes[i].end); 
-		} 
+			for (j = 0; j < cores; j++)
+				apicid_to_node[(nodeid << bits) + j] = i;
+			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+		}
 	}
 
 	numa_init_array();
 	return 0;
-} 
+}
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c
index 552e0847375..56fe7124fbe 100644
--- a/arch/x86/mm/mmap_32.c
+++ b/arch/x86/mm/mmap.c
@@ -1,10 +1,13 @@
 /*
- *  linux/arch/i386/mm/mmap.c
+ * Flexible mmap layout support
  *
- *  flexible mmap layout support
+ * Based on code by Ingo Molnar and Andi Kleen, copyrighted
+ * as follows:
  *
  * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
  * All Rights Reserved.
+ * Copyright 2005 Andi Kleen, SUSE Labs.
+ * Copyright 2007 Jiri Kosina, SUSE Labs.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -19,14 +22,12 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- *
- * Started by Ingo Molnar <mingo@elte.hu>
  */
 
 #include <linux/personality.h>
 #include <linux/mm.h>
 #include <linux/random.h>
+#include <linux/limits.h>
 #include <linux/sched.h>
 
 /*
@@ -37,20 +38,71 @@
 #define MIN_GAP (128*1024*1024)
 #define MAX_GAP (TASK_SIZE/6*5)
 
-static inline unsigned long mmap_base(struct mm_struct *mm)
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static int mmap_is_ia32(void)
 {
-	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
-	unsigned long random_factor = 0;
+#ifdef CONFIG_X86_32
+	return 1;
+#endif
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32))
+		return 1;
+#endif
+	return 0;
+}
 
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = get_random_int() % (1024*1024);
+static int mmap_is_legacy(void)
+{
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+static unsigned long mmap_rnd(void)
+{
+	unsigned long rnd = 0;
+
+	/*
+	*  8 bits of randomness in 32bit mmaps, 20 address space bits
+	* 28 bits of randomness in 64bit mmaps, 40 address space bits
+	*/
+	if (current->flags & PF_RANDOMIZE) {
+		if (mmap_is_ia32())
+			rnd = (long)get_random_int() % (1<<8);
+		else
+			rnd = (long)(get_random_int() % (1<<28));
+	}
+	return rnd << PAGE_SHIFT;
+}
+
+static unsigned long mmap_base(void)
+{
+	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
 
 	if (gap < MIN_GAP)
 		gap = MIN_GAP;
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - random_factor);
+	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+}
+
+/*
+ * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
+ * does, but not when emulating X86_32
+ */
+static unsigned long mmap_legacy_base(void)
+{
+	if (mmap_is_ia32())
+		return TASK_UNMAPPED_BASE;
+	else
+		return TASK_UNMAPPED_BASE + mmap_rnd();
 }
 
 /*
@@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm)
  */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-	/*
-	 * Fall back to the standard layout if the personality
-	 * bit is set, or if the expected stack growth is unlimited:
-	 */
-	if (sysctl_legacy_va_layout ||
-			(current->personality & ADDR_COMPAT_LAYOUT) ||
-			current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) {
-		mm->mmap_base = TASK_UNMAPPED_BASE;
+	if (mmap_is_legacy()) {
+		mm->mmap_base = mmap_legacy_base();
 		mm->get_unmapped_area = arch_get_unmapped_area;
 		mm->unmap_area = arch_unmap_area;
 	} else {
-		mm->mmap_base = mmap_base(mm);
+		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c
deleted file mode 100644
index 80bba0dc000..00000000000
--- a/arch/x86/mm/mmap_64.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2005 Andi Kleen, SuSE Labs.
- * Licensed under GPL, v.2
- */
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/random.h>
-#include <asm/ia32.h>
-
-/* Notebook: move the mmap code from sys_x86_64.c over here. */
-
-void arch_pick_mmap_layout(struct mm_struct *mm)
-{
-#ifdef CONFIG_IA32_EMULATION
-	if (current_thread_info()->flags & _TIF_IA32)
-		return ia32_pick_mmap_layout(mm);
-#endif
-	mm->mmap_base = TASK_UNMAPPED_BASE;
-	if (current->flags & PF_RANDOMIZE) {
-		/* Add 28bit randomness which is about 40bits of address space
-		   because mmap base has to be page aligned.
- 		   or ~1/128 of the total user VM
-	   	   (total user address space is 47bits) */
-		unsigned rnd = get_random_int() & 0xfffffff;
-		mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
-	}
-	mm->get_unmapped_area = arch_get_unmapped_area;
-	mm->unmap_area = arch_unmap_area;
-}
-
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 3d6926ba899..dc3b1f7e145 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -1,7 +1,7 @@
-/* 
+/*
  * Generic VM initialization for x86-64 NUMA setups.
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */ 
+ */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/string.h>
@@ -11,35 +11,45 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/sched.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
 #include <asm/numa.h>
 #include <asm/acpi.h>
+#include <asm/k8.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
 #endif
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 struct memnode memnode;
 
-unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
+int x86_cpu_to_node_map_init[NR_CPUS] = {
 	[0 ... NR_CPUS-1] = NUMA_NO_NODE
 };
-unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
- 	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+void *x86_cpu_to_node_map_early_ptr;
+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
+
+s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
 };
-cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
+
+cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_to_cpumask_map);
 
 int numa_off __initdata;
 unsigned long __initdata nodemap_addr;
 unsigned long __initdata nodemap_size;
 
-
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size;
  * 0 if memnodmap[] too small (of shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __init
-populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
+static int __init populate_memnodemap(const struct bootnode *nodes,
+				      int numnodes, int shift)
 {
-	int i; 
-	int res = -1;
 	unsigned long addr, end;
+	int i, res = -1;
 
-	memset(memnodemap, 0xff, memnodemapsize);
+	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
 	for (i = 0; i < numnodes; i++) {
 		addr = nodes[i].start;
 		end = nodes[i].end;
@@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
 		if ((end >> shift) >= memnodemapsize)
 			return 0;
 		do {
-			if (memnodemap[addr >> shift] != 0xff)
+			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
 				return -1;
 			memnodemap[addr >> shift] = i;
 			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
-	} 
+	}
 	return res;
 }
 
@@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void)
 	unsigned long pad, pad_addr;
 
 	memnodemap = memnode.embedded_map;
-	if (memnodemapsize <= 48)
+	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
 		return 0;
 
 	pad = L1_CACHE_BYTES - 1;
 	pad_addr = 0x8000;
-	nodemap_size = pad + memnodemapsize;
+	nodemap_size = pad + sizeof(s16) * memnodemapsize;
 	nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
 				      nodemap_size);
 	if (nodemap_addr == -1UL) {
@@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 	}
 	pad_addr = (nodemap_addr + pad) & ~pad;
 	memnodemap = phys_to_virt(pad_addr);
+	reserve_early(nodemap_addr, nodemap_addr + nodemap_size);
 
 	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
 	       nodemap_addr, nodemap_addr + nodemap_size);
@@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void)
  * The LSB of all start and end addresses in the node map is the value of the
  * maximum possible shift.
  */
-static int __init
-extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
+static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
+					 int numnodes)
 {
 	int i, nodes_used = 0;
 	unsigned long start, end;
@@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
 		shift);
 
 	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
-		printk(KERN_INFO
-	"Your memory is not aligned you need to rebuild your kernel "
-	"with a bigger NODEMAPSIZE shift=%d\n",
-			shift);
+		printk(KERN_INFO "Your memory is not aligned you need to "
+		       "rebuild your kernel with a bigger NODEMAPSIZE "
+		       "shift=%d\n", shift);
 		return -1;
 	}
 	return shift;
 }
 
-#ifdef CONFIG_SPARSEMEM
 int early_pfn_to_nid(unsigned long pfn)
 {
 	return phys_to_nid(pfn << PAGE_SHIFT);
 }
-#endif
 
-static void * __init
-early_node_mem(int nodeid, unsigned long start, unsigned long end,
-	      unsigned long size)
+static void * __init early_node_mem(int nodeid, unsigned long start,
+				    unsigned long end, unsigned long size)
 {
 	unsigned long mem = find_e820_area(start, end, size);
 	void *ptr;
+
 	if (mem != -1L)
 		return __va(mem);
 	ptr = __alloc_bootmem_nopanic(size,
 				SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
 	if (ptr == NULL) {
 		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
-			size, nodeid);
+		       size, nodeid);
 		return NULL;
 	}
 	return ptr;
 }
 
 /* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
-{ 
-	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; 
-	unsigned long nodedata_phys;
+void __init setup_node_bootmem(int nodeid, unsigned long start,
+			       unsigned long end)
+{
+	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size;
+	unsigned long bootmap_start, nodedata_phys;
 	void *bootmap;
 	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
 
-	start = round_up(start, ZONE_ALIGN); 
+	start = round_up(start, ZONE_ALIGN);
 
-	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+	       start, end);
 
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
@@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
 
 	/* Find a place for the bootmem map */
-	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); 
+	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
 	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
 	bootmap = early_node_mem(nodeid, bootmap_start, end,
 					bootmap_pages<<PAGE_SHIFT);
 	if (bootmap == NULL)  {
 		if (nodedata_phys < start || nodedata_phys >= end)
-			free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
+			free_bootmem((unsigned long)node_data[nodeid],
+				     pgdat_size);
 		node_data[nodeid] = NULL;
 		return;
 	}
 	bootmap_start = __pa(bootmap);
-	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); 
-	
+	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
+
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
-					 bootmap_start >> PAGE_SHIFT, 
-					 start_pfn, end_pfn); 
+					 bootmap_start >> PAGE_SHIFT,
+					 start_pfn, end_pfn);
 
 	free_bootmem_with_active_regions(nodeid, end);
 
-	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); 
-	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
+	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
+			     bootmap_pages<<PAGE_SHIFT);
 #ifdef CONFIG_ACPI_NUMA
 	srat_reserve_add_area(nodeid);
 #endif
 	node_set_online(nodeid);
-} 
-
-/* Initialize final allocator for a zone */
-void __init setup_node_zones(int nodeid)
-{ 
-	unsigned long start_pfn, end_pfn, memmapsize, limit;
-
- 	start_pfn = node_start_pfn(nodeid);
- 	end_pfn = node_end_pfn(nodeid);
-
-	Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
-		nodeid, start_pfn, end_pfn);
-
-	/* Try to allocate mem_map at end to not fill up precious <4GB
-	   memory. */
-	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
-	limit = end_pfn << PAGE_SHIFT;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
-	NODE_DATA(nodeid)->node_mem_map = 
-		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
-				memmapsize, SMP_CACHE_BYTES, 
-				round_down(limit - memmapsize, PAGE_SIZE), 
-				limit);
-#endif
-} 
+}
 
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
 void __init numa_init_array(void)
 {
 	int rr, i;
-	/* There are unfortunately some poorly designed mainboards around
-	   that only connect memory to a single CPU. This breaks the 1:1 cpu->node
-	   mapping. To avoid this fill in the mapping for all possible
-	   CPUs, as the number of CPUs is not known yet. 
-	   We round robin the existing nodes. */
+
 	rr = first_node(node_online_map);
 	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_to_node(i) != NUMA_NO_NODE)
+		if (early_cpu_to_node(i) != NUMA_NO_NODE)
 			continue;
- 		numa_set_node(i, rr);
+		numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
 	}
-
 }
 
 #ifdef CONFIG_NUMA_EMU
@@ -276,15 +265,17 @@ void __init numa_init_array(void)
 char *cmdline __initdata;
 
 /*
- * Setups up nid to range from addr to addr + size.  If the end boundary is
- * greater than max_addr, then max_addr is used instead.  The return value is 0
- * if there is additional memory left for allocation past addr and -1 otherwise.
- * addr is adjusted to be at the end of the node.
+ * Setups up nid to range from addr to addr + size.  If the end
+ * boundary is greater than max_addr, then max_addr is used instead.
+ * The return value is 0 if there is additional memory left for
+ * allocation past addr and -1 otherwise.  addr is adjusted to be at
+ * the end of the node.
  */
 static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 				   u64 size, u64 max_addr)
 {
 	int ret = 0;
+
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
@@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 
 	for (i = node_start; i < num_nodes + node_start; i++) {
 		u64 end = *addr + size;
+
 		if (i < big)
 			end += FAKE_NODE_MIN_SIZE;
 		/*
@@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 {
 	struct bootnode nodes[MAX_NUMNODES];
-	u64 addr = start_pfn << PAGE_SHIFT;
+	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = end_pfn << PAGE_SHIFT;
-	int num_nodes = 0;
-	int coeff_flag;
-	int coeff = -1;
-	int num = 0;
-	u64 size;
-	int i;
+	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 
 	memset(&nodes, 0, sizeof(nodes));
 	/*
@@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 	 * system RAM into N fake nodes.
 	 */
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
-						simple_strtol(cmdline, NULL, 0));
+		long n = simple_strtol(cmdline, NULL, 0);
+
+		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
@@ -483,46 +471,47 @@ out:
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
- 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
 	acpi_fake_nodes(nodes, num_nodes);
- 	numa_init_array();
- 	return 0;
+	numa_init_array();
+	return 0;
 }
 #endif /* CONFIG_NUMA_EMU */
 
 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
-{ 
+{
 	int i;
 
 	nodes_clear(node_possible_map);
 
 #ifdef CONFIG_NUMA_EMU
 	if (cmdline && !numa_emulation(start_pfn, end_pfn))
- 		return;
+		return;
 	nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
 	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 					  end_pfn << PAGE_SHIFT))
- 		return;
+		return;
 	nodes_clear(node_possible_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
+	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
+					end_pfn<<PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 #endif
 	printk(KERN_INFO "%s\n",
 	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
 
-	printk(KERN_INFO "Faking a node at %016lx-%016lx\n", 
+	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 	       start_pfn << PAGE_SHIFT,
-	       end_pfn << PAGE_SHIFT); 
-		/* setup dummy node covering all memory */ 
-	memnode_shift = 63; 
+	       end_pfn << PAGE_SHIFT);
+	/* setup dummy node covering all memory */
+	memnode_shift = 63;
 	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
 	nodes_clear(node_online_map);
@@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	node_set(0, node_possible_map);
 	for (i = 0; i < NR_CPUS; i++)
 		numa_set_node(i, 0);
-	node_to_cpumask[0] = cpumask_of_cpu(0);
+	/* cpumask_of_cpu() may not be available during early startup */
+	memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
+	cpu_set(0, node_to_cpumask_map[0]);
 	e820_register_active_regions(0, start_pfn, end_pfn);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
 
 __cpuinit void numa_add_cpu(int cpu)
 {
-	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
-} 
+	set_bit(cpu,
+		(unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
 
 void __cpuinit numa_set_node(int cpu, int node)
 {
+	int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
+
 	cpu_pda(cpu)->nodenumber = node;
-	cpu_to_node(cpu) = node;
+
+	if(cpu_to_node_map)
+		cpu_to_node_map[cpu] = node;
+	else if(per_cpu_offset(cpu))
+		per_cpu(x86_cpu_to_node_map, cpu) = node;
+	else
+		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
 }
 
-unsigned long __init numa_free_all_bootmem(void) 
-{ 
-	int i;
+unsigned long __init numa_free_all_bootmem(void)
+{
 	unsigned long pages = 0;
-	for_each_online_node(i) {
+	int i;
+
+	for_each_online_node(i)
 		pages += free_all_bootmem_node(NODE_DATA(i));
-	}
+
 	return pages;
-} 
+}
 
 void __init paging_init(void)
-{ 
-	int i;
+{
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
@@ -568,32 +569,27 @@ void __init paging_init(void)
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
 	sparse_init();
 
-	for_each_online_node(i) {
-		setup_node_zones(i); 
-	}
-
 	free_area_init_nodes(max_zone_pfns);
-} 
+}
 
 static __init int numa_setup(char *opt)
-{ 
+{
 	if (!opt)
 		return -EINVAL;
-	if (!strncmp(opt,"off",3))
+	if (!strncmp(opt, "off", 3))
 		numa_off = 1;
 #ifdef CONFIG_NUMA_EMU
 	if (!strncmp(opt, "fake=", 5))
 		cmdline = opt + 5;
 #endif
 #ifdef CONFIG_ACPI_NUMA
- 	if (!strncmp(opt,"noacpi",6))
- 		acpi_numa = -1;
-	if (!strncmp(opt,"hotadd=", 7))
+	if (!strncmp(opt, "noacpi", 6))
+		acpi_numa = -1;
+	if (!strncmp(opt, "hotadd=", 7))
 		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 #endif
 	return 0;
-} 
-
+}
 early_param("numa", numa_setup);
 
 /*
@@ -611,38 +607,16 @@ early_param("numa", numa_setup);
 void __init init_cpu_to_node(void)
 {
 	int i;
- 	for (i = 0; i < NR_CPUS; i++) {
-		u8 apicid = x86_cpu_to_apicid_init[i];
+
+	for (i = 0; i < NR_CPUS; i++) {
+		u16 apicid = x86_cpu_to_apicid_init[i];
+
 		if (apicid == BAD_APICID)
 			continue;
 		if (apicid_to_node[apicid] == NUMA_NO_NODE)
 			continue;
-		numa_set_node(i,apicid_to_node[apicid]);
+		numa_set_node(i, apicid_to_node[apicid]);
 	}
 }
 
-EXPORT_SYMBOL(cpu_to_node);
-EXPORT_SYMBOL(node_to_cpumask);
-EXPORT_SYMBOL(memnode);
-EXPORT_SYMBOL(node_data);
-
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * Functions to convert PFNs from/to per node page addresses.
- * These are out of line because they are quite big.
- * They could be all tuned by pre caching more state.
- * Should do that.
- */
 
-int pfn_valid(unsigned long pfn)
-{
-	unsigned nid;
-	if (pfn >= num_physpages)
-		return 0;
-	nid = pfn_to_nid(pfn);
-	if (nid == 0xff)
-		return 0;
-	return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
-}
-EXPORT_SYMBOL(pfn_valid);
-#endif
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 00000000000..06353d43f72
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,224 @@
+/*
+ * self test for change_page_attr.
+ *
+ * Clears the global bit on random pages in the direct mapping, then reverts
+ * and compares page tables forwards and afterwards.
+ */
+#include <linux/bootmem.h>
+#include <linux/random.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+enum {
+	NTEST			= 4000,
+#ifdef CONFIG_X86_64
+	LPS			= (1 << PMD_SHIFT),
+#elif defined(CONFIG_X86_PAE)
+	LPS			= (1 << PMD_SHIFT),
+#else
+	LPS			= (1 << 22),
+#endif
+	GPS			= (1<<30)
+};
+
+struct split_state {
+	long lpg, gpg, spg, exec;
+	long min_exec, max_exec;
+};
+
+static __init int print_split(struct split_state *s)
+{
+	long i, expected, missed = 0;
+	int printed = 0;
+	int err = 0;
+
+	s->lpg = s->gpg = s->spg = s->exec = 0;
+	s->min_exec = ~0UL;
+	s->max_exec = 0;
+	for (i = 0; i < max_pfn_mapped; ) {
+		unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
+		int level;
+		pte_t *pte;
+
+		pte = lookup_address(addr, &level);
+		if (!pte) {
+			if (!printed) {
+				dump_pagetable(addr);
+				printk(KERN_INFO "CPA %lx no pte level %d\n",
+					addr, level);
+				printed = 1;
+			}
+			missed++;
+			i++;
+			continue;
+		}
+
+		if (level == PG_LEVEL_1G && sizeof(long) == 8) {
+			s->gpg++;
+			i += GPS/PAGE_SIZE;
+		} else if (level == PG_LEVEL_2M) {
+			if (!(pte_val(*pte) & _PAGE_PSE)) {
+				printk(KERN_ERR
+					"%lx level %d but not PSE %Lx\n",
+					addr, level, (u64)pte_val(*pte));
+				err = 1;
+			}
+			s->lpg++;
+			i += LPS/PAGE_SIZE;
+		} else {
+			s->spg++;
+			i++;
+		}
+		if (!(pte_val(*pte) & _PAGE_NX)) {
+			s->exec++;
+			if (addr < s->min_exec)
+				s->min_exec = addr;
+			if (addr > s->max_exec)
+				s->max_exec = addr;
+		}
+	}
+	printk(KERN_INFO
+		"CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+		s->spg, s->lpg, s->gpg, s->exec,
+		s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed);
+
+	expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
+	if (expected != i) {
+		printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
+			max_pfn_mapped, expected);
+		return 1;
+	}
+	return err;
+}
+
+static unsigned long __initdata addr[NTEST];
+static unsigned int __initdata len[NTEST];
+
+/* Change the global bit on random pages in the direct mapping */
+static __init int exercise_pageattr(void)
+{
+	struct split_state sa, sb, sc;
+	unsigned long *bm;
+	pte_t *pte, pte0;
+	int failed = 0;
+	int level;
+	int i, k;
+	int err;
+
+	printk(KERN_INFO "CPA exercising pageattr\n");
+
+	bm = vmalloc((max_pfn_mapped + 7) / 8);
+	if (!bm) {
+		printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
+		return -ENOMEM;
+	}
+	memset(bm, 0, (max_pfn_mapped + 7) / 8);
+
+	failed += print_split(&sa);
+	srandom32(100);
+
+	for (i = 0; i < NTEST; i++) {
+		unsigned long pfn = random32() % max_pfn_mapped;
+
+		addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
+		len[i] = random32() % 100;
+		len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
+
+		if (len[i] == 0)
+			len[i] = 1;
+
+		pte = NULL;
+		pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
+
+		for (k = 0; k < len[i]; k++) {
+			pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
+			if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) {
+				addr[i] = 0;
+				break;
+			}
+			if (k == 0) {
+				pte0 = *pte;
+			} else {
+				if (pgprot_val(pte_pgprot(*pte)) !=
+					pgprot_val(pte_pgprot(pte0))) {
+					len[i] = k;
+					break;
+				}
+			}
+			if (test_bit(pfn + k, bm)) {
+				len[i] = k;
+				break;
+			}
+			__set_bit(pfn + k, bm);
+		}
+		if (!addr[i] || !pte || !k) {
+			addr[i] = 0;
+			continue;
+		}
+
+		err = change_page_attr_clear(addr[i], len[i],
+					       __pgprot(_PAGE_GLOBAL));
+		if (err < 0) {
+			printk(KERN_ERR "CPA %d failed %d\n", i, err);
+			failed++;
+		}
+
+		pte = lookup_address(addr[i], &level);
+		if (!pte || pte_global(*pte) || pte_huge(*pte)) {
+			printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
+				pte ? (u64)pte_val(*pte) : 0ULL);
+			failed++;
+		}
+		if (level != PG_LEVEL_4K) {
+			printk(KERN_ERR "CPA %lx: unexpected level %d\n",
+				addr[i], level);
+			failed++;
+		}
+
+	}
+	vfree(bm);
+
+	failed += print_split(&sb);
+
+	printk(KERN_INFO "CPA reverting everything\n");
+	for (i = 0; i < NTEST; i++) {
+		if (!addr[i])
+			continue;
+		pte = lookup_address(addr[i], &level);
+		if (!pte) {
+			printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
+			failed++;
+			continue;
+		}
+		err = change_page_attr_set(addr[i], len[i],
+					     __pgprot(_PAGE_GLOBAL));
+		if (err < 0) {
+			printk(KERN_ERR "CPA reverting failed: %d\n", err);
+			failed++;
+		}
+		pte = lookup_address(addr[i], &level);
+		if (!pte || !pte_global(*pte)) {
+			printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
+				addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
+			failed++;
+		}
+
+	}
+
+	failed += print_split(&sc);
+
+	if (failed) {
+		printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n");
+		WARN_ON(1);
+	} else {
+		printk(KERN_INFO "CPA selftests PASSED\n");
+	}
+
+	return 0;
+}
+module_init(exercise_pageattr);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 00000000000..1cc6607eacb
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,564 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @addr:	virtual start address
+ * @size:	number of bytes to flush
+ *
+ * clflush is an unordered instruction which needs fencing with mfence
+ * to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+	void *vend = vaddr + size - 1;
+
+	mb();
+
+	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+		clflush(vaddr);
+	/*
+	 * Flush any possible final partial cacheline:
+	 */
+	clflush(vend);
+
+	mb();
+}
+
+static void __cpa_flush_all(void *arg)
+{
+	/*
+	 * Flush all to work around Errata in early athlons regarding
+	 * large page flushing.
+	 */
+	__flush_tlb_all();
+
+	if (boot_cpu_data.x86_model >= 4)
+		wbinvd();
+}
+
+static void cpa_flush_all(void)
+{
+	BUG_ON(irqs_disabled());
+
+	on_each_cpu(__cpa_flush_all, NULL, 1, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+	/*
+	 * We could optimize that further and do individual per page
+	 * tlb invalidates for a low number of pages. Caveat: we must
+	 * flush the high aliases on 64bit as well.
+	 */
+	__flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages)
+{
+	unsigned int i, level;
+	unsigned long addr;
+
+	BUG_ON(irqs_disabled());
+	WARN_ON(PAGE_ALIGN(start) != start);
+
+	on_each_cpu(__cpa_flush_range, NULL, 1, 1);
+
+	/*
+	 * We only need to flush on one CPU,
+	 * clflush is a MESI-coherent instruction that
+	 * will cause all other CPUs to flush the same
+	 * cachelines:
+	 */
+	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+		pte_t *pte = lookup_address(addr, &level);
+
+		/*
+		 * Only flush present addresses:
+		 */
+		if (pte && pte_present(*pte))
+			clflush_cache_range((void *) addr, PAGE_SIZE);
+	}
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address)
+{
+	pgprot_t forbidden = __pgprot(0);
+
+	/*
+	 * The BIOS area between 640k and 1Mb needs to be executable for
+	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+	 */
+	if (within(__pa(address), BIOS_BEGIN, BIOS_END))
+		pgprot_val(forbidden) |= _PAGE_NX;
+
+	/*
+	 * The kernel text needs to be executable for obvious reasons
+	 * Does not cover __inittext since that is gone later on
+	 */
+	if (within(address, (unsigned long)_text, (unsigned long)_etext))
+		pgprot_val(forbidden) |= _PAGE_NX;
+
+#ifdef CONFIG_DEBUG_RODATA
+	/* The .rodata section needs to be read-only */
+	if (within(address, (unsigned long)__start_rodata,
+				(unsigned long)__end_rodata))
+		pgprot_val(forbidden) |= _PAGE_RW;
+#endif
+
+	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+	return prot;
+}
+
+pte_t *lookup_address(unsigned long address, int *level)
+{
+	pgd_t *pgd = pgd_offset_k(address);
+	pud_t *pud;
+	pmd_t *pmd;
+
+	*level = PG_LEVEL_NONE;
+
+	if (pgd_none(*pgd))
+		return NULL;
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud))
+		return NULL;
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	*level = PG_LEVEL_2M;
+	if (pmd_large(*pmd))
+		return (pte_t *)pmd;
+
+	*level = PG_LEVEL_4K;
+	return pte_offset_kernel(pmd, address);
+}
+
+static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+	/* change init_mm */
+	set_pte_atomic(kpte, pte);
+#ifdef CONFIG_X86_32
+	if (!SHARED_KERNEL_PMD) {
+		struct page *page;
+
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pud_t *pud;
+			pmd_t *pmd;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pud = pud_offset(pgd, address);
+			pmd = pmd_offset(pud, address);
+			set_pte_atomic((pte_t *)pmd, pte);
+		}
+	}
+#endif
+}
+
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+	pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+	gfp_t gfp_flags = GFP_KERNEL;
+	unsigned long flags;
+	unsigned long addr;
+	pte_t *pbase, *tmp;
+	struct page *base;
+	unsigned int i, level;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN;
+	gfp_flags = GFP_ATOMIC | __GFP_NOWARN;
+#endif
+	base = alloc_pages(gfp_flags, 0);
+	if (!base)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pgd_lock, flags);
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up for us already:
+	 */
+	tmp = lookup_address(address, &level);
+	if (tmp != kpte) {
+		WARN_ON_ONCE(1);
+		goto out_unlock;
+	}
+
+	address = __pa(address);
+	addr = address & LARGE_PAGE_MASK;
+	pbase = (pte_t *)page_address(base);
+#ifdef CONFIG_X86_32
+	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+#endif
+
+	pgprot_val(ref_prot) &= ~_PAGE_NX;
+	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE)
+		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot));
+
+	/*
+	 * Install the new, split up pagetable. Important detail here:
+	 *
+	 * On Intel the NX bit of all levels must be cleared to make a
+	 * page executable. See section 4.13.2 of Intel 64 and IA-32
+	 * Architectures Software Developer's Manual).
+	 */
+	ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte)));
+	__set_pmd_pte(kpte, address, mk_pte(base, ref_prot));
+	base = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&pgd_lock, flags);
+
+	if (base)
+		__free_pages(base, 0);
+
+	return 0;
+}
+
+static int
+__change_page_attr(unsigned long address, unsigned long pfn,
+		   pgprot_t mask_set, pgprot_t mask_clr)
+{
+	struct page *kpte_page;
+	int level, err = 0;
+	pte_t *kpte;
+
+#ifdef CONFIG_X86_32
+	BUG_ON(pfn > max_low_pfn);
+#endif
+
+repeat:
+	kpte = lookup_address(address, &level);
+	if (!kpte)
+		return -EINVAL;
+
+	kpte_page = virt_to_page(kpte);
+	BUG_ON(PageLRU(kpte_page));
+	BUG_ON(PageCompound(kpte_page));
+
+	if (level == PG_LEVEL_4K) {
+		pgprot_t new_prot = pte_pgprot(*kpte);
+		pte_t new_pte, old_pte = *kpte;
+
+		pgprot_val(new_prot) &= ~pgprot_val(mask_clr);
+		pgprot_val(new_prot) |= pgprot_val(mask_set);
+
+		new_prot = static_protections(new_prot, address);
+
+		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+		BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte));
+
+		set_pte_atomic(kpte, new_pte);
+	} else {
+		err = split_large_page(kpte, address);
+		if (!err)
+			goto repeat;
+	}
+	return err;
+}
+
+/**
+ * change_page_attr_addr - Change page table attributes in linear mapping
+ * @address: Virtual address in linear mapping.
+ * @prot:    New page table attribute (PAGE_*)
+ *
+ * Change page attributes of a page in the direct mapping. This is a variant
+ * of change_page_attr() that also works on memory holes that do not have
+ * mem_map entry (pfn_valid() is false).
+ *
+ * See change_page_attr() documentation for more details.
+ *
+ * Modules and drivers should use the set_memory_* APIs instead.
+ */
+
+#define HIGH_MAP_START	__START_KERNEL_map
+#define HIGH_MAP_END	(__START_KERNEL_map + KERNEL_TEXT_SIZE)
+
+static int
+change_page_attr_addr(unsigned long address, pgprot_t mask_set,
+		      pgprot_t mask_clr)
+{
+	unsigned long phys_addr = __pa(address);
+	unsigned long pfn = phys_addr >> PAGE_SHIFT;
+	int err;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * If we are inside the high mapped kernel range, then we
+	 * fixup the low mapping first. __va() returns the virtual
+	 * address in the linear mapping:
+	 */
+	if (within(address, HIGH_MAP_START, HIGH_MAP_END))
+		address = (unsigned long) __va(phys_addr);
+#endif
+
+	err = __change_page_attr(address, pfn, mask_set, mask_clr);
+	if (err)
+		return err;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * If the physical address is inside the kernel map, we need
+	 * to touch the high mapped kernel as well:
+	 */
+	if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) {
+		/*
+		 * Calc the high mapping address. See __phys_addr()
+		 * for the non obvious details.
+		 */
+		address = phys_addr + HIGH_MAP_START - phys_base;
+		/* Make sure the kernel mappings stay executable */
+		pgprot_val(mask_clr) |= _PAGE_NX;
+
+		/*
+		 * Our high aliases are imprecise, because we check
+		 * everything between 0 and KERNEL_TEXT_SIZE, so do
+		 * not propagate lookup failures back to users:
+		 */
+		__change_page_attr(address, pfn, mask_set, mask_clr);
+	}
+#endif
+	return err;
+}
+
+static int __change_page_attr_set_clr(unsigned long addr, int numpages,
+				      pgprot_t mask_set, pgprot_t mask_clr)
+{
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) {
+		ret = change_page_attr_addr(addr, mask_set, mask_clr);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int change_page_attr_set_clr(unsigned long addr, int numpages,
+				    pgprot_t mask_set, pgprot_t mask_clr)
+{
+	int ret = __change_page_attr_set_clr(addr, numpages, mask_set,
+					     mask_clr);
+
+	/*
+	 * On success we use clflush, when the CPU supports it to
+	 * avoid the wbindv. If the CPU does not support it and in the
+	 * error case we fall back to cpa_flush_all (which uses
+	 * wbindv):
+	 */
+	if (!ret && cpu_has_clflush)
+		cpa_flush_range(addr, numpages);
+	else
+		cpa_flush_all();
+
+	return ret;
+}
+
+static inline int change_page_attr_set(unsigned long addr, int numpages,
+				       pgprot_t mask)
+{
+	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int change_page_attr_clear(unsigned long addr, int numpages,
+					 pgprot_t mask)
+{
+	return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages,
+				    __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages,
+				      __pgprot(_PAGE_PCD | _PAGE_PWT));
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX));
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW));
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_rw(addr, numpages);
+}
+
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG)
+static inline int __change_page_attr_set(unsigned long addr, int numpages,
+					 pgprot_t mask)
+{
+	return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+}
+
+static inline int __change_page_attr_clear(unsigned long addr, int numpages,
+					   pgprot_t mask)
+{
+	return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+}
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return __change_page_attr_set(addr, numpages,
+				      __pgprot(_PAGE_PRESENT | _PAGE_RW));
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return __change_page_attr_clear(addr, numpages,
+					__pgprot(_PAGE_PRESENT));
+}
+
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (PageHighMem(page))
+		return;
+	if (!enable) {
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
+	}
+
+	/*
+	 * If page allocator is not up yet then do not call c_p_a():
+	 */
+	if (!debug_pagealloc_enabled)
+		return;
+
+	/*
+	 * The return value is ignored - the calls cannot fail,
+	 * large pages are disabled at boot time:
+	 */
+	if (enable)
+		__set_pages_p(page, numpages);
+	else
+		__set_pages_np(page, numpages);
+
+	/*
+	 * We should perform an IPI and flush all tlbs,
+	 * but that can deadlock->flush only current cpu:
+	 */
+	__flush_tlb_all();
+}
+#endif
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c
deleted file mode 100644
index 260073c0760..00000000000
--- a/arch/x86/mm/pageattr_32.c
+++ /dev/null
@@ -1,278 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Thanks to Ben LaHaise for precious feedback.
- */ 
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/sections.h>
-
-static DEFINE_SPINLOCK(cpa_lock);
-static struct list_head df_list = LIST_HEAD_INIT(df_list);
-
-
-pte_t *lookup_address(unsigned long address) 
-{ 
-	pgd_t *pgd = pgd_offset_k(address);
-	pud_t *pud;
-	pmd_t *pmd;
-	if (pgd_none(*pgd))
-		return NULL;
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud))
-		return NULL;
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd))
-		return NULL;
-	if (pmd_large(*pmd))
-		return (pte_t *)pmd;
-        return pte_offset_kernel(pmd, address);
-} 
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-					pgprot_t ref_prot)
-{ 
-	int i; 
-	unsigned long addr;
-	struct page *base;
-	pte_t *pbase;
-
-	spin_unlock_irq(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL, 0);
-	spin_lock_irq(&cpa_lock);
-	if (!base) 
-		return NULL;
-
-	/*
-	 * page_private is used to track the number of entries in
-	 * the page table page that have non standard attributes.
-	 */
-	SetPagePrivate(base);
-	page_private(base) = 0;
-
-	address = __pa(address);
-	addr = address & LARGE_PAGE_MASK; 
-	pbase = (pte_t *)page_address(base);
-	paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-               set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
-                                          addr == address ? prot : ref_prot));
-	}
-	return base;
-} 
-
-static void cache_flush_page(struct page *p)
-{ 
-	void *adr = page_address(p);
-	int i;
-	for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
-		clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
-	struct list_head *lh = (struct list_head *)arg;
-	struct page *p;
-
-	/* High level code is not ready for clflush yet */
-	if (0 && cpu_has_clflush) {
-		list_for_each_entry (p, lh, lru)
-			cache_flush_page(p);
-	} else if (boot_cpu_data.x86_model >= 4)
-		wbinvd();
-
-	/* Flush all to work around Errata in early athlons regarding 
-	 * large page flushing. 
-	 */
-	__flush_tlb_all(); 	
-}
-
-static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
-{ 
-	struct page *page;
-	unsigned long flags;
-
-	set_pte_atomic(kpte, pte); 	/* change init_mm */
-	if (SHARED_KERNEL_PMD)
-		return;
-
-	spin_lock_irqsave(&pgd_lock, flags);
-	for (page = pgd_list; page; page = (struct page *)page->index) {
-		pgd_t *pgd;
-		pud_t *pud;
-		pmd_t *pmd;
-		pgd = (pgd_t *)page_address(page) + pgd_index(address);
-		pud = pud_offset(pgd, address);
-		pmd = pmd_offset(pud, address);
-		set_pte_atomic((pte_t *)pmd, pte);
-	}
-	spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-/* 
- * No more special protections in this 2/4MB area - revert to a
- * large page again. 
- */
-static inline void revert_page(struct page *kpte_page, unsigned long address)
-{
-	pgprot_t ref_prot;
-	pte_t *linear;
-
-	ref_prot =
-	((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-		? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
-	linear = (pte_t *)
-		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
-	set_pmd_pte(linear,  address,
-		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-			    ref_prot));
-}
-
-static inline void save_page(struct page *kpte_page)
-{
-	if (!test_and_set_bit(PG_arch_1, &kpte_page->flags))
-		list_add(&kpte_page->lru, &df_list);
-}
-
-static int
-__change_page_attr(struct page *page, pgprot_t prot)
-{ 
-	pte_t *kpte; 
-	unsigned long address;
-	struct page *kpte_page;
-
-	BUG_ON(PageHighMem(page));
-	address = (unsigned long)page_address(page);
-
-	kpte = lookup_address(address);
-	if (!kpte)
-		return -EINVAL;
-	kpte_page = virt_to_page(kpte);
-	BUG_ON(PageLRU(kpte_page));
-	BUG_ON(PageCompound(kpte_page));
-
-	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { 
-		if (!pte_huge(*kpte)) {
-			set_pte_atomic(kpte, mk_pte(page, prot)); 
-		} else {
-			pgprot_t ref_prot;
-			struct page *split;
-
-			ref_prot =
-			((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-				? PAGE_KERNEL_EXEC : PAGE_KERNEL;
-			split = split_large_page(address, prot, ref_prot);
-			if (!split)
-				return -ENOMEM;
-			set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
-			kpte_page = split;
-		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
-		set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
-		BUG_ON(page_private(kpte_page) == 0);
-		page_private(kpte_page)--;
-	} else
-		BUG();
-
-	/*
-	 * If the pte was reserved, it means it was created at boot
-	 * time (not via split_large_page) and in turn we must not
-	 * replace it with a largepage.
-	 */
-
-	save_page(kpte_page);
-	if (!PageReserved(kpte_page)) {
-		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
-			paravirt_release_pt(page_to_pfn(kpte_page));
-			revert_page(kpte_page, address);
-		}
-	}
-	return 0;
-} 
-
-static inline void flush_map(struct list_head *l)
-{
-	on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- * 
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- * 
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-	int err = 0; 
-	int i; 
-	unsigned long flags;
-
-	spin_lock_irqsave(&cpa_lock, flags);
-	for (i = 0; i < numpages; i++, page++) { 
-		err = __change_page_attr(page, prot);
-		if (err) 
-			break; 
-	} 	
-	spin_unlock_irqrestore(&cpa_lock, flags);
-	return err;
-}
-
-void global_flush_tlb(void)
-{
-	struct list_head l;
-	struct page *pg, *next;
-
-	BUG_ON(irqs_disabled());
-
-	spin_lock_irq(&cpa_lock);
-	list_replace_init(&df_list, &l);
-	spin_unlock_irq(&cpa_lock);
-	flush_map(&l);
-	list_for_each_entry_safe(pg, next, &l, lru) {
-		list_del(&pg->lru);
-		clear_bit(PG_arch_1, &pg->flags);
-		if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0)
-			continue;
-		ClearPagePrivate(pg);
-		__free_page(pg);
-	}
-}
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-void kernel_map_pages(struct page *page, int numpages, int enable)
-{
-	if (PageHighMem(page))
-		return;
-	if (!enable)
-		debug_check_no_locks_freed(page_address(page),
-					   numpages * PAGE_SIZE);
-
-	/* the return value is ignored - the calls cannot fail,
-	 * large pages are disabled at boot time.
-	 */
-	change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
-	/* we should perform an IPI and flush all tlbs,
-	 * but that can deadlock->flush only current cpu.
-	 */
-	__flush_tlb_all();
-}
-#endif
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c
deleted file mode 100644
index c40afbaaf93..00000000000
--- a/arch/x86/mm/pageattr_64.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/* 
- * Copyright 2002 Andi Kleen, SuSE Labs. 
- * Thanks to Ben LaHaise for precious feedback.
- */ 
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-pte_t *lookup_address(unsigned long address)
-{ 
-	pgd_t *pgd = pgd_offset_k(address);
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	if (pgd_none(*pgd))
-		return NULL;
-	pud = pud_offset(pgd, address);
-	if (!pud_present(*pud))
-		return NULL; 
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		return NULL; 
-	if (pmd_large(*pmd))
-		return (pte_t *)pmd;
-	pte = pte_offset_kernel(pmd, address);
-	if (pte && !pte_present(*pte))
-		pte = NULL; 
-	return pte;
-} 
-
-static struct page *split_large_page(unsigned long address, pgprot_t prot,
-				     pgprot_t ref_prot)
-{ 
-	int i; 
-	unsigned long addr;
-	struct page *base = alloc_pages(GFP_KERNEL, 0);
-	pte_t *pbase;
-	if (!base) 
-		return NULL;
-	/*
-	 * page_private is used to track the number of entries in
-	 * the page table page have non standard attributes.
-	 */
-	SetPagePrivate(base);
-	page_private(base) = 0;
-
-	address = __pa(address);
-	addr = address & LARGE_PAGE_MASK; 
-	pbase = (pte_t *)page_address(base);
-	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
-		pbase[i] = pfn_pte(addr >> PAGE_SHIFT, 
-				   addr == address ? prot : ref_prot);
-	}
-	return base;
-} 
-
-void clflush_cache_range(void *adr, int size)
-{
-	int i;
-	for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
-		clflush(adr+i);
-}
-
-static void flush_kernel_map(void *arg)
-{
-	struct list_head *l = (struct list_head *)arg;
-	struct page *pg;
-
-	/* When clflush is available always use it because it is
-	   much cheaper than WBINVD. */
-	/* clflush is still broken. Disable for now. */
-	if (1 || !cpu_has_clflush)
-		asm volatile("wbinvd" ::: "memory");
-	else list_for_each_entry(pg, l, lru) {
-		void *adr = page_address(pg);
-		clflush_cache_range(adr, PAGE_SIZE);
-	}
-	__flush_tlb_all();
-}
-
-static inline void flush_map(struct list_head *l)
-{	
-	on_each_cpu(flush_kernel_map, l, 1, 1);
-}
-
-static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
-
-static inline void save_page(struct page *fpage)
-{
-	if (!test_and_set_bit(PG_arch_1, &fpage->flags))
-		list_add(&fpage->lru, &deferred_pages);
-}
-
-/* 
- * No more special protections in this 2/4MB area - revert to a
- * large page again. 
- */
-static void revert_page(unsigned long address, pgprot_t ref_prot)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t large_pte;
-	unsigned long pfn;
-
-	pgd = pgd_offset_k(address);
-	BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd,address);
-	BUG_ON(pud_none(*pud));
-	pmd = pmd_offset(pud, address);
-	BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
-	pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
-	large_pte = pfn_pte(pfn, ref_prot);
-	large_pte = pte_mkhuge(large_pte);
-	set_pte((pte_t *)pmd, large_pte);
-}      
-
-static int
-__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
-				   pgprot_t ref_prot)
-{ 
-	pte_t *kpte; 
-	struct page *kpte_page;
-	pgprot_t ref_prot2;
-
-	kpte = lookup_address(address);
-	if (!kpte) return 0;
-	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
-	BUG_ON(PageLRU(kpte_page));
-	BUG_ON(PageCompound(kpte_page));
-	if (pgprot_val(prot) != pgprot_val(ref_prot)) { 
-		if (!pte_huge(*kpte)) {
-			set_pte(kpte, pfn_pte(pfn, prot));
-		} else {
- 			/*
-			 * split_large_page will take the reference for this
-			 * change_page_attr on the split page.
- 			 */
-			struct page *split;
-			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(address, prot, ref_prot2);
-			if (!split)
-				return -ENOMEM;
-			pgprot_val(ref_prot2) &= ~_PAGE_NX;
-			set_pte(kpte, mk_pte(split, ref_prot2));
-			kpte_page = split;
-		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
-		set_pte(kpte, pfn_pte(pfn, ref_prot));
-		BUG_ON(page_private(kpte_page) == 0);
-		page_private(kpte_page)--;
-	} else
-		BUG();
-
-	/* on x86-64 the direct mapping set at boot is not using 4k pages */
- 	BUG_ON(PageReserved(kpte_page));
-
-	save_page(kpte_page);
-	if (page_private(kpte_page) == 0)
-		revert_page(address, ref_prot);
-	return 0;
-} 
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * This should be used when a page is mapped with a different caching policy
- * than write-back somewhere - some CPUs do not like it when mappings with
- * different caching policies exist. This changes the page attributes of the
- * in kernel linear mapping too.
- * 
- * The caller needs to ensure that there are no conflicting mappings elsewhere.
- * This function only deals with the kernel linear map.
- * 
- * Caller must call global_flush_tlb() after this.
- */
-int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
-{
-	int err = 0, kernel_map = 0;
-	int i; 
-
-	if (address >= __START_KERNEL_map
-	    && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
-		address = (unsigned long)__va(__pa(address));
-		kernel_map = 1;
-	}
-
-	down_write(&init_mm.mmap_sem);
-	for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
-		unsigned long pfn = __pa(address) >> PAGE_SHIFT;
-
-		if (!kernel_map || pte_present(pfn_pte(0, prot))) {
-			err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
-			if (err)
-				break;
-		}
-		/* Handle kernel mapping too which aliases part of the
-		 * lowmem */
-		if (__pa(address) < KERNEL_TEXT_SIZE) {
-			unsigned long addr2;
-			pgprot_t prot2;
-			addr2 = __START_KERNEL_map + __pa(address);
-			/* Make sure the kernel mappings stay executable */
-			prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
-			err = __change_page_attr(addr2, pfn, prot2,
-						 PAGE_KERNEL_EXEC);
-		} 
-	} 	
-	up_write(&init_mm.mmap_sem); 
-	return err;
-}
-
-/* Don't call this for MMIO areas that may not have a mem_map entry */
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-	unsigned long addr = (unsigned long)page_address(page);
-	return change_page_attr_addr(addr, numpages, prot);
-}
-
-void global_flush_tlb(void)
-{ 
-	struct page *pg, *next;
-	struct list_head l;
-
-	/*
-	 * Write-protect the semaphore, to exclude two contexts
-	 * doing a list_replace_init() call in parallel and to
-	 * exclude new additions to the deferred_pages list:
-	 */
-	down_write(&init_mm.mmap_sem);
-	list_replace_init(&deferred_pages, &l);
-	up_write(&init_mm.mmap_sem);
-
-	flush_map(&l);
-
-	list_for_each_entry_safe(pg, next, &l, lru) {
-		list_del(&pg->lru);
-		clear_bit(PG_arch_1, &pg->flags);
-		if (page_private(pg) != 0)
-			continue;
-		ClearPagePrivate(pg);
-		__free_page(pg);
-	} 
-} 
-
-EXPORT_SYMBOL(change_page_attr);
-EXPORT_SYMBOL(global_flush_tlb);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index be61a1d845a..2ae5999a795 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pte;
 }
 
-void pmd_ctor(struct kmem_cache *cache, void *pmd)
-{
-	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
 /*
  * List of all pgd's needed for non-PAE so it can invalidate entries
  * in both cached and uncached pgd's; not needed for PAE since the
@@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd)
  * vmalloc faults work because attached pagetables are never freed.
  * -- wli
  */
-DEFINE_SPINLOCK(pgd_lock);
-struct page *pgd_list;
-
 static inline void pgd_list_add(pgd_t *pgd)
 {
 	struct page *page = virt_to_page(pgd);
-	page->index = (unsigned long)pgd_list;
-	if (pgd_list)
-		set_page_private(pgd_list, (unsigned long)&page->index);
-	pgd_list = page;
-	set_page_private(page, (unsigned long)&pgd_list);
+
+	list_add(&page->lru, &pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
-	struct page *next, **pprev, *page = virt_to_page(pgd);
-	next = (struct page *)page->index;
-	pprev = (struct page **)page_private(page);
-	*pprev = next;
-	if (next)
-		set_page_private(next, (unsigned long)pprev);
+	struct page *page = virt_to_page(pgd);
+
+	list_del(&page->lru);
 }
 
 
@@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd)
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
 	spin_lock_irqsave(&pgd_lock, flags);
 	pgd_list_del(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd)
 #define UNSHARED_PTRS_PER_PGD				\
 	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
 
-/* If we allocate a pmd for part of the kernel address space, then
-   make sure its initialized with the appropriate kernel mappings.
-   Otherwise use a cached zeroed pmd.  */
-static pmd_t *pmd_cache_alloc(int idx)
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(pgd_t *pgdp)
 {
-	pmd_t *pmd;
+	int i;
 
-	if (idx >= USER_PTRS_PER_PGD) {
-		pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
+	for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+		pgd_t pgd = pgdp[i];
 
-		if (pmd)
-			memcpy(pmd,
-			       (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
+		if (pgd_val(pgd) != 0) {
+			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+			pgdp[i] = native_make_pgd(0);
+
+			paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+			pmd_free(pmd);
+		}
+	}
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+	pud_t *pud;
+	unsigned long addr;
+	int i;
+
+	pud = pud_offset(pgd, 0);
+ 	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+	     i++, pud++, addr += PUD_SIZE) {
+		pmd_t *pmd = pmd_alloc_one(mm, addr);
+
+		if (!pmd) {
+			pgd_mop_up_pmds(pgd);
+			return 0;
+		}
+
+		if (i >= USER_PTRS_PER_PGD)
+			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
 			       sizeof(pmd_t) * PTRS_PER_PMD);
-	} else
-		pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
 
-	return pmd;
+		pud_populate(mm, pud, pmd);
+	}
+
+	return 1;
+}
+#else  /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+	return 1;
 }
 
-static void pmd_cache_free(pmd_t *pmd, int idx)
+static void pgd_mop_up_pmds(pgd_t *pgd)
 {
-	if (idx >= USER_PTRS_PER_PGD)
-		free_page((unsigned long)pmd);
-	else
-		kmem_cache_free(pmd_cache, pmd);
 }
+#endif	/* CONFIG_X86_PAE */
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	int i;
 	pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
 
-	if (PTRS_PER_PMD == 1 || !pgd)
-		return pgd;
+	mm->pgd = pgd;		/* so that alloc_pd can use it */
 
- 	for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = pmd_cache_alloc(i);
-
-		if (!pmd)
-			goto out_oom;
-
-		paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
-		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+		quicklist_free(0, pgd_dtor, pgd);
+		pgd = NULL;
 	}
-	return pgd;
 
-out_oom:
-	for (i--; i >= 0; i--) {
-		pgd_t pgdent = pgd[i];
-		void* pmd = (void *)__va(pgd_val(pgdent)-1);
-		paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-		pmd_cache_free(pmd, i);
-	}
-	quicklist_free(0, pgd_dtor, pgd);
-	return NULL;
+	return pgd;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-	int i;
-
-	/* in the PAE case user pgd entries are overwritten before usage */
-	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
-			pgd_t pgdent = pgd[i];
-			void* pmd = (void *)__va(pgd_val(pgdent)-1);
-			paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
-			pmd_cache_free(pmd, i);
-		}
-	/* in the non-PAE case, free_pgtables() clears user pgd entries */
+	pgd_mop_up_pmds(pgd);
 	quicklist_free(0, pgd_dtor, pgd);
 }
 
@@ -372,4 +376,3 @@ void check_pgt_cache(void)
 {
 	quicklist_trim(0, pgd_dtor, 25, 16);
 }
-
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index ea85172fc0c..65416f843e5 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -130,6 +130,9 @@ void __init
 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 {
 	int pxm, node;
+	int apic_id;
+
+	apic_id = pa->apic_id;
 	if (srat_disabled())
 		return;
 	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 		bad_srat();
 		return;
 	}
-	apicid_to_node[pa->apic_id] = node;
+	apicid_to_node[apic_id] = node;
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
-	       pxm, pa->apic_id, node);
-}
-
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-/*
- * Protect against too large hotadd areas that would fill up memory.
- */
-static int hotadd_enough_memory(struct bootnode *nd)
-{
-	static unsigned long allocated;
-	static unsigned long last_area_end;
-	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
-	long mem = pages * sizeof(struct page);
-	unsigned long addr;
-	unsigned long allowed;
-	unsigned long oldpages = pages;
-
-	if (mem < 0)
-		return 0;
-	allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
-	allowed = (allowed / 100) * hotadd_percent;
-	if (allocated + mem > allowed) {
-		unsigned long range;
-		/* Give them at least part of their hotadd memory upto hotadd_percent
-		   It would be better to spread the limit out
-		   over multiple hotplug areas, but that is too complicated
-		   right now */
-		if (allocated >= allowed)
-			return 0;
-		range = allowed - allocated;
-		pages = (range / PAGE_SIZE);
-		mem = pages * sizeof(struct page);
-		nd->end = nd->start + range;
-	}
-	/* Not completely fool proof, but a good sanity check */
-	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
-	if (addr == -1UL)
-		return 0;
-	if (pages != oldpages)
-		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
-			pages << PAGE_SHIFT);
-	last_area_end = addr + mem;
-	allocated += mem;
-	return 1;
-}
-
-static int update_end_of_memory(unsigned long end)
-{
-	found_add_area = 1;
-	if ((end >> PAGE_SHIFT) > end_pfn)
-		end_pfn = end >> PAGE_SHIFT;
-	return 1;
+	       pxm, apic_id, node);
 }
 
-static inline int save_add_info(void)
-{
-	return hotadd_percent > 0;
-}
-#else
 int update_end_of_memory(unsigned long end) {return -1;}
 static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;}
 #else
 static inline int save_add_info(void) {return 0;}
 #endif
-#endif
 /*
  * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add infomation.
+ * Both SPARSE and RESERVE need nodes_add information.
  * This code supports one contiguous hot add area per node.
  */
 static int reserve_hotadd(int node, unsigned long start, unsigned long end)
@@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 	return 1;
 }
 
-static void unparse_node(int node)
+static void __init unparse_node(int node)
 {
 	int i;
 	node_clear(node, nodes_parsed);
@@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cutoff_node(i, start, end);
-		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
+		/*
+		 * don't confuse VM with a node that doesn't have the
+		 * minimum memory.
+		 */
+		if (nodes[i].end &&
+			(nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
 			unparse_node(i);
 			node_set_offline(i);
 		}
@@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 
 	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_to_node(i) == NUMA_NO_NODE)
+		int node = early_cpu_to_node(i);
+
+		if (node == NUMA_NO_NODE)
 			continue;
-		if (!node_isset(cpu_to_node(i), node_possible_map))
+		if (!node_isset(node, node_possible_map))
 			numa_set_node(i, NUMA_NO_NODE);
 	}
 	numa_init_array();
@@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 }
 
 #ifdef CONFIG_NUMA_EMU
+static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
+	[0 ... MAX_NUMNODES-1] = PXM_INVAL
+};
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
 static int __init find_node_by_addr(unsigned long addr)
 {
 	int ret = NUMA_NO_NODE;
@@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr)
 			break;
 		}
 	}
-	return i;
+	return ret;
 }
 
 /*
@@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr)
 void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 {
 	int i, j;
-	int fake_node_to_pxm_map[MAX_NUMNODES] = {
-		[0 ... MAX_NUMNODES-1] = PXM_INVAL
-	};
-	unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
-		[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-	};
 
 	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
 			 "topology.\n");