9 files changed, 557 insertions, 190 deletions
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
index 1d232a87f11..d25ac86fe27 100644
--- a/arch/x86_64/mm/Makefile
+++ b/arch/x86_64/mm/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux x86_64-specific parts of the memory manager.
 #
 
-obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o
+obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o mmap.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NUMA) += numa.o
 obj-$(CONFIG_K8_NUMA) += k8topology.o
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index b75b872ec15..26eac194064 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -35,6 +35,13 @@
 #include <asm-generic/sections.h>
 #include <asm/kdebug.h>
 
+/* Page fault error code bits */
+#define PF_PROT	(1<<0)		/* or no page found */
+#define PF_WRITE	(1<<1)
+#define PF_USER	(1<<2)
+#define PF_RSVD	(1<<3)
+#define PF_INSTR	(1<<4)
+
 void bust_spinlocks(int yes)
 {
 	int loglevel_save = console_loglevel;
@@ -68,7 +75,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
 	unsigned char *max_instr;
 
 	/* If it was a exec fault ignore */
-	if (error_code & (1<<4))
+	if (error_code & PF_INSTR)
 		return 0;
 	
 	instr = (unsigned char *)convert_rip_to_linear(current, regs);
@@ -222,17 +229,22 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 				 unsigned long error_code)
 {
 	unsigned long flags = oops_begin();
+	struct task_struct *tsk;
 
 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 	       current->comm, address);
 	dump_pagetable(address);
+	tsk = current;
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
 	__die("Bad pagetable", regs, error_code);
 	oops_end(flags);
 	do_exit(SIGKILL);
 }
 
 /*
- * Handle a fault on the vmalloc or module mapping area
+ * Handle a fault on the vmalloc area
  *
  * This assumes no large pages in there.
  */
@@ -278,7 +290,6 @@ static int vmalloc_fault(unsigned long address)
 	   that. */
 	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 		BUG();
-	__flush_tlb_all();
 	return 0;
 }
 
@@ -289,12 +300,6 @@ int exception_trace = 1;
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
- *
- * error_code:
- *	bit 0 == 0 means no page found, 1 means protection fault
- *	bit 1 == 0 means read, 1 means write
- *	bit 2 == 0 means kernel, 1 means user-mode
- *      bit 3 == 1 means fault was an instruction fetch
  */
 asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 					unsigned long error_code)
@@ -308,18 +313,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	unsigned long flags;
 	siginfo_t info;
 
-#ifdef CONFIG_CHECKING
-	{ 
-		unsigned long gs; 
-		struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); 
-		rdmsrl(MSR_GS_BASE, gs); 
-		if (gs != (unsigned long)pda) { 
-			wrmsrl(MSR_GS_BASE, pda); 
-			printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
-		}
-	}
-#endif
-
 	/* get the address */
 	__asm__("movq %%cr2,%0":"=r" (address));
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
@@ -349,12 +342,16 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	 *
 	 * This verifies that the fault happens in kernel space
 	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 1) == 0.
+	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(address >= TASK_SIZE64)) {
-		if (!(error_code & 5) &&
-		      ((address >= VMALLOC_START && address < VMALLOC_END) ||
-		       (address >= MODULES_VADDR && address < MODULES_END))) {
+		/*
+		 * Don't check for the module range here: its PML4
+		 * is always initialized because it's shared with the main
+		 * kernel text. Only vmalloc may need PML4 syncups.
+		 */
+		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
+		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
 			if (vmalloc_fault(address) < 0)
 				goto bad_area_nosemaphore;
 			return;
@@ -366,7 +363,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area_nosemaphore;
 	}
 
-	if (unlikely(error_code & (1 << 3)))
+	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(address, regs, error_code);
 
 	/*
@@ -393,7 +390,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	 * thus avoiding the deadlock.
 	 */
 	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
+		if ((error_code & PF_USER) == 0 &&
 		    !search_exception_tables(regs->rip))
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
@@ -420,17 +417,17 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 good_area:
 	info.si_code = SEGV_ACCERR;
 	write = 0;
-	switch (error_code & 3) {
+	switch (error_code & (PF_PROT|PF_WRITE)) {
 		default:	/* 3: write, present */
 			/* fall through */
-		case 2:		/* write, not present */
+		case PF_WRITE:		/* write, not present */
 			if (!(vma->vm_flags & VM_WRITE))
 				goto bad_area;
 			write++;
 			break;
-		case 1:		/* read, present */
+		case PF_PROT:		/* read, present */
 			goto bad_area;
-		case 0:		/* read, not present */
+		case 0:			/* read, not present */
 			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
 				goto bad_area;
 	}
@@ -465,7 +462,7 @@ bad_area:
 
 bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & 4) {
+	if (error_code & PF_USER) {
 		if (is_prefetch(regs, address, error_code))
 			return;
 
@@ -533,6 +530,9 @@ no_context:
 	printk_address(regs->rip);
 	printk("\n");
 	dump_pagetable(address);
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
 	__die("Oops", regs, error_code);
 	/* Executive summary in case the body of the oops scrolled away */
 	printk(KERN_EMERG "CR2: %016lx\n", address);
@@ -558,7 +558,7 @@ do_sigbus:
 	up_read(&mm->mmap_sem);
 
 	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & 4))
+	if (!(error_code & PF_USER))
 		goto no_context;
 
 	tsk->thread.cr2 = address;
@@ -571,3 +571,10 @@ do_sigbus:
 	force_sig_info(SIGBUS, &info, tsk);
 	return;
 }
+
+static int __init enable_pagefaulttrace(char *str)
+{
+	page_fault_trace = 1;
+	return 0;
+}
+__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index e60a1a848de..7af1742aa95 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,6 +22,10 @@
 #include <linux/pagemap.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -36,16 +40,18 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/dma-mapping.h>
+#include <asm/swiotlb.h>
 
 #ifndef Dprintk
 #define Dprintk(x...)
 #endif
 
-#ifdef CONFIG_GART_IOMMU
-extern int swiotlb;
-#endif
+struct dma_mapping_ops* dma_ops;
+EXPORT_SYMBOL(dma_ops);
 
-extern char _stext[];
+static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
@@ -86,9 +92,6 @@ void show_mem(void)
 
 /* References to section boundaries */
 
-extern char _text, _etext, _edata, __bss_start, _end[];
-extern char __init_begin, __init_end;
-
 int after_bootmem;
 
 static void *spp_getpage(void)
@@ -179,13 +182,19 @@ static  struct temp_map {
 	{}
 }; 
 
-static __init void *alloc_low_page(int *index, unsigned long *phys) 
+static __meminit void *alloc_low_page(int *index, unsigned long *phys)
 { 
 	struct temp_map *ti;
 	int i; 
 	unsigned long pfn = table_end++, paddr; 
 	void *adr;
 
+	if (after_bootmem) {
+		adr = (void *)get_zeroed_page(GFP_ATOMIC);
+		*phys = __pa(adr);
+		return adr;
+	}
+
 	if (pfn >= end_pfn) 
 		panic("alloc_low_page: ran out of memory"); 
 	for (i = 0; temp_mappings[i].allocated; i++) {
@@ -198,55 +207,86 @@ static __init void *alloc_low_page(int *index, unsigned long *phys)
 	ti->allocated = 1; 
 	__flush_tlb(); 	       
 	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); 
+	memset(adr, 0, PAGE_SIZE);
 	*index = i; 
 	*phys  = pfn * PAGE_SIZE;  
 	return adr; 
 } 
 
-static __init void unmap_low_page(int i)
+static __meminit void unmap_low_page(int i)
 { 
-	struct temp_map *ti = &temp_mappings[i];
+	struct temp_map *ti;
+
+	if (after_bootmem)
+		return;
+
+	ti = &temp_mappings[i];
 	set_pmd(ti->pmd, __pmd(0));
 	ti->allocated = 0; 
 } 
 
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+		unsigned long entry;
+
+		if (address > end) {
+			for (; i < PTRS_PER_PMD; i++, pmd++)
+				set_pmd(pmd, __pmd(0));
+			break;
+		}
+		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+		entry &= __supported_pte_mask;
+		set_pmd(pmd, __pmd(entry));
+	}
+}
+
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+	if (pmd_none(*pmd)) {
+		spin_lock(&init_mm.page_table_lock);
+		phys_pmd_init(pmd, address, end);
+		spin_unlock(&init_mm.page_table_lock);
+		__flush_tlb_all();
+	}
+}
+
+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 { 
-	long i, j; 
+	long i = pud_index(address);
 
-	i = pud_index(address);
 	pud = pud + i;
+
+	if (after_bootmem && pud_val(*pud)) {
+		phys_pmd_update(pud, address, end);
+		return;
+	}
+
 	for (; i < PTRS_PER_PUD; pud++, i++) {
 		int map; 
 		unsigned long paddr, pmd_phys;
 		pmd_t *pmd;
 
-		paddr = address + i*PUD_SIZE;
-		if (paddr >= end) { 
-			for (; i < PTRS_PER_PUD; i++, pud++) 
-				set_pud(pud, __pud(0)); 
+		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+		if (paddr >= end)
 			break;
-		} 
 
-		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) { 
+		if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
 			set_pud(pud, __pud(0)); 
 			continue;
 		} 
 
 		pmd = alloc_low_page(&map, &pmd_phys);
+		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-			unsigned long pe;
-
-			if (paddr >= end) { 
-				for (; j < PTRS_PER_PMD; j++, pmd++)
-					set_pmd(pmd,  __pmd(0)); 
-				break;
-		}
-			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
-			pe &= __supported_pte_mask;
-			set_pmd(pmd, __pmd(pe));
-		}
+		phys_pmd_init(pmd, paddr, end);
+		spin_unlock(&init_mm.page_table_lock);
 		unmap_low_page(map);
 	}
 	__flush_tlb();
@@ -254,25 +294,32 @@ static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned lon
 
 static void __init find_early_table_space(unsigned long end)
 {
-	unsigned long puds, pmds, tables;
+	unsigned long puds, pmds, tables, start;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
 	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
 	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
 		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
 
-	table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
+ 	/* RED-PEN putting page tables only on node 0 could
+ 	   cause a hotspot and fill up ZONE_DMA. The page tables
+ 	   need roughly 0.5KB per GB. */
+ 	start = 0x8000;
+ 	table_start = find_e820_area(start, end, tables);
 	if (table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
 	table_start >>= PAGE_SHIFT;
 	table_end = table_start;
+
+	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
 }
 
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from the 
    physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 { 
 	unsigned long next; 
 
@@ -284,7 +331,8 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 	 * mapped.  Unfortunately this is done currently before the nodes are 
 	 * discovered.
 	 */
-	find_early_table_space(end);
+	if (!after_bootmem)
+		find_early_table_space(end);
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
@@ -292,58 +340,106 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 	for (; start < end; start = next) {
 		int map;
 		unsigned long pud_phys; 
-		pud_t *pud = alloc_low_page(&map, &pud_phys);
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		if (after_bootmem)
+			pud = pud_offset_k(pgd, __PAGE_OFFSET);
+		else
+			pud = alloc_low_page(&map, &pud_phys);
+
 		next = start + PGDIR_SIZE;
 		if (next > end) 
 			next = end; 
 		phys_pud_init(pud, __pa(start), __pa(next));
-		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+		if (!after_bootmem)
+			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 		unmap_low_page(map);   
 	} 
 
-	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+	if (!after_bootmem)
+		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
 	__flush_tlb_all();
-	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
-	       table_start<<PAGE_SHIFT, 
-	       table_end<<PAGE_SHIFT);
 }
 
-extern struct x8664_pda cpu_pda[NR_CPUS];
+void __cpuinit zap_low_mappings(int cpu)
+{
+	if (cpu == 0) {
+		pgd_t *pgd = pgd_offset_k(0UL);
+		pgd_clear(pgd);
+	} else {
+		/*
+		 * For AP's, zap the low identity mappings by changing the cr3
+		 * to init_level4_pgt and doing local flush tlb all
+		 */
+		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
+	}
+	__flush_tlb_all();
+}
 
-/* Assumes all CPUs still execute in init_mm */
-void zap_low_mappings(void)
+/* Compute zone sizes for the DMA and DMA32 zones in a node. */
+__init void
+size_zones(unsigned long *z, unsigned long *h,
+	   unsigned long start_pfn, unsigned long end_pfn)
 {
-	pgd_t *pgd = pgd_offset_k(0UL);
-	pgd_clear(pgd);
-	flush_tlb_all();
+ 	int i;
+ 	unsigned long w;
+
+ 	for (i = 0; i < MAX_NR_ZONES; i++)
+ 		z[i] = 0;
+
+ 	if (start_pfn < MAX_DMA_PFN)
+ 		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
+ 	if (start_pfn < MAX_DMA32_PFN) {
+ 		unsigned long dma32_pfn = MAX_DMA32_PFN;
+ 		if (dma32_pfn > end_pfn)
+ 			dma32_pfn = end_pfn;
+ 		z[ZONE_DMA32] = dma32_pfn - start_pfn;
+ 	}
+ 	z[ZONE_NORMAL] = end_pfn - start_pfn;
+
+ 	/* Remove lower zones from higher ones. */
+ 	w = 0;
+ 	for (i = 0; i < MAX_NR_ZONES; i++) {
+ 		if (z[i])
+ 			z[i] -= w;
+ 	        w += z[i];
+	}
+
+	/* Compute holes */
+	w = start_pfn;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		unsigned long s = w;
+		w += z[i];
+		h[i] = e820_hole_size(s, w);
+	}
+
+	/* Add the space pace needed for mem_map to the holes too. */
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
+
+	/* The 16MB DMA zone has the kernel and other misc mappings.
+ 	   Account them too */
+	if (h[ZONE_DMA]) {
+		h[ZONE_DMA] += dma_reserve;
+		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
+			printk(KERN_WARNING
+				"Kernel too large and filling up ZONE_DMA?\n");
+			h[ZONE_DMA] = z[ZONE_DMA];
+		}
+	}
 }
 
 #ifndef CONFIG_NUMA
 void __init paging_init(void)
 {
-	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long holes[MAX_NR_ZONES];
-		unsigned int max_dma;
-
-		memset(zones_size, 0, sizeof(zones_size));
-		memset(holes, 0, sizeof(holes));
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (end_pfn < max_dma) {
-			zones_size[ZONE_DMA] = end_pfn;
-			holes[ZONE_DMA] = e820_hole_size(0, end_pfn);
-		} else {
-			zones_size[ZONE_DMA] = max_dma;
-			holes[ZONE_DMA] = e820_hole_size(0, max_dma);
-			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
-			holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn);
-		}
-		free_area_init_node(0, NODE_DATA(0), zones_size,
-                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
-	}
-	return;
+	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+
+	memory_present(0, 0, end_pfn);
+	sparse_init();
+	size_zones(zones, holes, 0, end_pfn);
+	free_area_init_node(0, NODE_DATA(0), zones,
+			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 }
 #endif
 
@@ -381,6 +477,50 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 	__flush_tlb_all();
 } 
 
+/*
+ * Memory hotplug specific functions
+ * These are only for non-NUMA machines right now.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	set_page_count(page, 1);
+	__free_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+int add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdat = NODE_DATA(0);
+	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int ret;
+
+	ret = __add_pages(zone, start_pfn, nr_pages);
+	if (ret)
+		goto error;
+
+	init_memory_mapping(start, (start + size -1));
+
+	return ret;
+error:
+	printk("%s: Problem encountered in __add_pages!\n", __func__);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#endif
+
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
 
@@ -389,12 +529,9 @@ void __init mem_init(void)
 	long codesize, reservedpages, datasize, initsize;
 
 #ifdef CONFIG_SWIOTLB
-	if (!iommu_aperture &&
-	    (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
-	       swiotlb = 1;
-	if (swiotlb)
-		swiotlb_init();	
+	pci_swiotlb_init();
 #endif
+	no_iommu_init();
 
 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;
@@ -438,19 +575,16 @@ void __init mem_init(void)
 		datasize >> 10,
 		initsize >> 10);
 
+#ifdef CONFIG_SMP
 	/*
-	 * Subtle. SMP is doing its boot stuff late (because it has to
-	 * fork idle threads) - but it also needs low mappings for the
-	 * protected-mode entry to work. We zap these entries only after
-	 * the WP-bit has been tested.
+	 * Sync boot_level4_pgt mappings with the init_level4_pgt
+	 * except for the low identity mappings which are already zapped
+	 * in init_level4_pgt. This sync-up is essential for AP's bringup
 	 */
-#ifndef CONFIG_SMP
-	zap_low_mappings();
+	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
 #endif
 }
 
-extern char __initdata_begin[], __initdata_end[];
-
 void free_initmem(void)
 {
 	unsigned long addr;
@@ -464,13 +598,36 @@ void free_initmem(void)
 		totalram_pages++;
 	}
 	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
-	printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10);
+	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
 }
 
+#ifdef CONFIG_DEBUG_RODATA
+
+extern char __start_rodata, __end_rodata;
+void mark_rodata_ro(void)
+{
+	unsigned long addr = (unsigned long)&__start_rodata;
+
+	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
+
+	printk ("Write protecting the kernel read-only data: %luk\n",
+			(&__end_rodata - &__start_rodata) >> 10);
+
+	/*
+	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
+	 * We do this after the printk so that if something went wrong in the
+	 * change, the printk gets out at least to give a better debug hint
+	 * of who is the culprit.
+	 */
+	global_flush_tlb();
+}
+#endif
+
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	if (start < (unsigned long)&_end)
+	if (start >= end)
 		return;
 	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
@@ -491,6 +648,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
 #else       		
 	reserve_bootmem(phys, len);    
 #endif
+	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+		dma_reserve += len / PAGE_SIZE;
 }
 
 int kern_addr_valid(unsigned long addr) 
@@ -532,10 +691,6 @@ extern int exception_trace, page_fault_trace;
 static ctl_table debug_table2[] = {
 	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
 	  proc_dointvec },
-#ifdef CONFIG_CHECKING
-	{ 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
-	  proc_dointvec },
-#endif
 	{ 0, }
 }; 
 
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index ecf7acb5db9..ae207064201 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -247,9 +247,15 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
 	return __ioremap(phys_addr, size, _PAGE_PCD);
 }
 
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
 void iounmap(volatile void __iomem *addr)
 {
-	struct vm_struct *p;
+	struct vm_struct *p, *o;
 
 	if (addr <= high_memory) 
 		return; 
@@ -257,12 +263,31 @@ void iounmap(volatile void __iomem *addr)
 		addr < phys_to_virt(ISA_END_ADDRESS))
 		return;
 
-	write_lock(&vmlist_lock);
-	p = __remove_vm_area((void *)((unsigned long)addr & PAGE_MASK));
-	if (!p)
+	addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr);
+	/* Use the vm area unlocked, assuming the caller
+	   ensures there isn't another iounmap for the same address
+	   in parallel. Reuse of the virtual address is prevented by
+	   leaving it in the global lists until we're done with it.
+	   cpa takes care of the direct mappings. */
+	read_lock(&vmlist_lock);
+	for (p = vmlist; p; p = p->next) {
+		if (p->addr == addr)
+			break;
+	}
+	read_unlock(&vmlist_lock);
+
+	if (!p) {
 		printk("iounmap: bad address %p\n", addr);
-	else if (p->flags >> 20)
+		dump_stack();
+		return;
+	}
+
+	/* Reset the direct mapping. Can block */
+	if (p->flags >> 20)
 		ioremap_change_attr(p->phys_addr, p->size, 0);
-	write_unlock(&vmlist_lock);
+
+	/* Finally remove it */
+	o = remove_vm_area((void *)addr);
+	BUG_ON(p != o || o == NULL);
 	kfree(p); 
 }
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 65417b040c1..a5663e0bb01 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -108,6 +108,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		limit >>= 16; 
 		limit <<= 24; 
 		limit |= (1<<24)-1;
+		limit++;
 
 		if (limit > end_pfn << PAGE_SHIFT)
 			limit = end_pfn << PAGE_SHIFT;
diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c
new file mode 100644
index 00000000000..43e9b99bdf2
--- /dev/null
+++ b/arch/x86_64/mm/mmap.c
@@ -0,0 +1,30 @@
+/* Copyright 2005 Andi Kleen, SuSE Labs.
+ * Licensed under GPL, v.2
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <asm/ia32.h>
+
+/* Notebook: move the mmap code from sys_x86_64.c over here. */
+
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (current_thread_info()->flags & _TIF_IA32)
+		return ia32_pick_mmap_layout(mm);
+#endif
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	if (current->flags & PF_RANDOMIZE) {
+		/* Add 28bit randomness which is about 40bits of address space
+		   because mmap base has to be page aligned.
+ 		   or ~1/128 of the total user VM
+	   	   (total user address space is 47bits) */
+		unsigned rnd = get_random_int() & 0xfffffff;
+		mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT;
+	}
+	mm->get_unmapped_area = arch_get_unmapped_area;
+	mm->unmap_area = arch_unmap_area;
+}
+
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 21480382100..6ef9f9a7623 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -38,38 +38,59 @@ cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
 
-int __init compute_hash_shift(struct node *nodes, int numnodes)
+
+/*
+ * Given a shift value, try to populate memnodemap[]
+ * Returns :
+ * 1 if OK
+ * 0 if memnodmap[] too small (of shift too small)
+ * -1 if node overlap or lost ram (shift too big)
+ */
+static int __init
+populate_memnodemap(const struct node *nodes, int numnodes, int shift)
 {
 	int i; 
-	int shift = 20;
-	unsigned long addr,maxend=0;
-	
-	for (i = 0; i < numnodes; i++)
-		if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
-				maxend = nodes[i].end;
+	int res = -1;
+	unsigned long addr, end;
 
-	while ((1UL << shift) <  (maxend / NODEMAPSIZE))
-		shift++;
-
-	printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
-			shift,maxend);
-	memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
+	if (shift >= 64)
+		return -1;
+	memset(memnodemap, 0xff, sizeof(memnodemap));
 	for (i = 0; i < numnodes; i++) {
-		if (nodes[i].start == nodes[i].end)
+		addr = nodes[i].start;
+		end = nodes[i].end;
+		if (addr >= end)
 			continue;
-		for (addr = nodes[i].start;
-		     addr < nodes[i].end;
-		     addr += (1UL << shift)) {
-			if (memnodemap[addr >> shift] != 0xff) {
-				printk(KERN_INFO
-	"Your memory is not aligned you need to rebuild your kernel "
-	"with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
-					shift,addr);
+		if ((end >> shift) >= NODEMAPSIZE)
+			return 0;
+		do {
+			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
-			} 
 			memnodemap[addr >> shift] = i;
-		} 
+                       addr += (1UL << shift);
+		} while (addr < end);
+		res = 1;
 	} 
+	return res;
+}
+
+int __init compute_hash_shift(struct node *nodes, int numnodes)
+{
+	int shift = 20;
+
+	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
+		shift++;
+
+	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
+		shift);
+
+	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+		printk(KERN_INFO
+	"Your memory is not aligned you need to rebuild your kernel "
+	"with a bigger NODEMAPSIZE shift=%d\n",
+			shift);
+		return -1;
+	}
 	return shift;
 }
 
@@ -89,12 +110,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 
 	start = round_up(start, ZONE_ALIGN); 
 
-	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
+	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
 
 	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
 
-	memory_present(nodeid, start_pfn, end_pfn);
 	nodedata_phys = find_e820_area(start, end, pgdat_size); 
 	if (nodedata_phys == -1L) 
 		panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -132,29 +152,14 @@ void __init setup_node_zones(int nodeid)
 	unsigned long start_pfn, end_pfn; 
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
-	unsigned long dma_end_pfn;
 
-	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
-	memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
+ 	start_pfn = node_start_pfn(nodeid);
+ 	end_pfn = node_end_pfn(nodeid);
 
-	start_pfn = node_start_pfn(nodeid);
-	end_pfn = node_end_pfn(nodeid);
+	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
+		nodeid, start_pfn, end_pfn);
 
-	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
-	
-	/* All nodes > 0 have a zero length zone DMA */ 
-	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
-	if (start_pfn < dma_end_pfn) { 
-		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
-		holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
-		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
-		holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
-
-	} else { 
-		zones[ZONE_NORMAL] = end_pfn - start_pfn; 
-		holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
-	} 
-    
+	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
 } 
@@ -171,7 +176,7 @@ void __init numa_init_array(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] != NUMA_NO_NODE)
 			continue;
-		cpu_to_node[i] = rr;
+ 		numa_set_node(i, rr);
 		rr = next_node(rr, node_online_map);
 		if (rr == MAX_NUMNODES)
 			rr = first_node(node_online_map);
@@ -195,7 +200,7 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  		while ((x << 1) < sz)
  			x <<= 1;
  		if (x < sz/2)
- 			printk("Numa emulation unbalanced. Complain to maintainer\n");
+ 			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
  		sz = x;
  	}
 
@@ -205,8 +210,6 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
  		if (i == numa_fake-1)
  			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
  		nodes[i].end = nodes[i].start + sz;
- 		if (i != numa_fake-1)
- 			nodes[i].end--;
  		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
  		       i,
  		       nodes[i].start, nodes[i].end,
@@ -257,7 +260,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	nodes_clear(node_online_map);
 	node_set_online(0);
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_to_node[i] = 0;
+		numa_set_node(i, 0);
 	node_to_cpumask[0] = cpumask_of_cpu(0);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
@@ -267,6 +270,12 @@ __cpuinit void numa_add_cpu(int cpu)
 	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
 } 
 
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	cpu_pda(cpu)->nodenumber = node;
+	cpu_to_node[cpu] = node;
+}
+
 unsigned long __init numa_free_all_bootmem(void) 
 { 
 	int i;
@@ -277,9 +286,26 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 } 
 
+#ifdef CONFIG_SPARSEMEM
+static void __init arch_sparse_init(void)
+{
+	int i;
+
+	for_each_online_node(i)
+		memory_present(i, node_start_pfn(i), node_end_pfn(i));
+
+	sparse_init();
+}
+#else
+#define arch_sparse_init() do {} while (0)
+#endif
+
 void __init paging_init(void)
 { 
 	int i;
+
+	arch_sparse_init();
+
 	for_each_online_node(i) {
 		setup_node_zones(i); 
 	}
@@ -304,8 +330,69 @@ __init int numa_setup(char *opt)
 	return 1;
 } 
 
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ */
+void __init init_cpu_to_node(void)
+{
+	int i;
+ 	for (i = 0; i < NR_CPUS; i++) {
+		u8 apicid = x86_cpu_to_apicid[i];
+		if (apicid == BAD_APICID)
+			continue;
+		if (apicid_to_node[apicid] == NUMA_NO_NODE)
+			continue;
+ 		cpu_to_node[i] = apicid_to_node[apicid];
+	}
+}
+
 EXPORT_SYMBOL(cpu_to_node);
 EXPORT_SYMBOL(node_to_cpumask);
 EXPORT_SYMBOL(memnode_shift);
 EXPORT_SYMBOL(memnodemap);
 EXPORT_SYMBOL(node_data);
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * Functions to convert PFNs from/to per node page addresses.
+ * These are out of line because they are quite big.
+ * They could be all tuned by pre caching more state.
+ * Should do that.
+ */
+
+/* Requires pfn_valid(pfn) to be true */
+struct page *pfn_to_page(unsigned long pfn)
+{
+	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
+	return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
+}
+EXPORT_SYMBOL(pfn_to_page);
+
+unsigned long page_to_pfn(struct page *page)
+{
+	return (long)(((page) - page_zone(page)->zone_mem_map) +
+		      page_zone(page)->zone_start_pfn);
+}
+EXPORT_SYMBOL(page_to_pfn);
+
+int pfn_valid(unsigned long pfn)
+{
+	unsigned nid;
+	if (pfn >= num_physpages)
+		return 0;
+	nid = pfn_to_nid(pfn);
+	if (nid == 0xff)
+		return 0;
+	return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
+}
+EXPORT_SYMBOL(pfn_valid);
+#endif
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index b90e8fe9eeb..35f1f1aab06 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -128,6 +128,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 	pte_t *kpte; 
 	struct page *kpte_page;
 	unsigned kpte_flags;
+	pgprot_t ref_prot2;
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
 	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -140,10 +141,14 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
  			 * split_large_page will take the reference for this change_page_attr
  			 * on the split page.
  			 */
-			struct page *split = split_large_page(address, prot, ref_prot); 
+
+			struct page *split;
+			ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
+
+			split = split_large_page(address, prot, ref_prot2);
 			if (!split)
 				return -ENOMEM;
-			set_pte(kpte,mk_pte(split, ref_prot));
+			set_pte(kpte,mk_pte(split, ref_prot2));
 			kpte_page = split;
 		}	
 		get_page(kpte_page);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 4b2e844c15a..8b7f85608fa 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -17,21 +17,23 @@
 #include <linux/topology.h>
 #include <asm/proto.h>
 #include <asm/numa.h>
+#include <asm/e820.h>
 
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
 static nodemask_t nodes_found __initdata;
 static struct node nodes[MAX_NUMNODES] __initdata;
-static __u8  pxm2node[256] = { [0 ... 255] = 0xff };
+static u8 pxm2node[256] = { [0 ... 255] = 0xff };
 
 static int node_to_pxm(int n);
 
 int pxm_to_node(int pxm)
 {
 	if ((unsigned)pxm >= 256)
-		return 0;
-	return pxm2node[pxm];
+		return -1;
+	/* Extend 0xff to (int)-1 */
+	return (signed char)pxm2node[pxm];
 }
 
 static __init int setup_node(int pxm)
@@ -71,8 +73,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 			nd->start = nd->end;
 	}
 	if (nd->end > end) {
-		if (!(end & 0xfff))
-			end--;
 		nd->end = end;
 		if (nd->start > nd->end)
 			nd->start = nd->end;
@@ -93,9 +93,36 @@ static __init inline int srat_disabled(void)
 	return numa_off || acpi_numa < 0;
 }
 
+/*
+ * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
+ * up the NUMA heuristics which wants the local node to have a smaller
+ * distance than the others.
+ * Do some quick checks here and only use the SLIT if it passes.
+ */
+static __init int slit_valid(struct acpi_table_slit *slit)
+{
+	int i, j;
+	int d = slit->localities;
+	for (i = 0; i < d; i++) {
+		for (j = 0; j < d; j++)  {
+			u8 val = slit->entry[d*i + j];
+			if (i == j) {
+				if (val != 10)
+					return 0;
+			} else if (val <= 10)
+				return 0;
+		}
+	}
+	return 1;
+}
+
 /* Callback for SLIT parsing */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
+	if (!slit_valid(slit)) {
+		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
+		return;
+	}
 	acpi_slit = slit;
 }
 
@@ -166,18 +193,43 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if (nd->end < end)
 			nd->end = end;
 	}
-	if (!(nd->end & 0xfff))
-		nd->end--;
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
 }
 
+/* Sanity check to catch more bad SRATs (they are amazingly common).
+   Make sure the PXMs cover all memory. */
+static int nodes_cover_memory(void)
+{
+	int i;
+	unsigned long pxmram, e820ram;
+
+	pxmram = 0;
+	for_each_node_mask(i, nodes_parsed) {
+		unsigned long s = nodes[i].start >> PAGE_SHIFT;
+		unsigned long e = nodes[i].end >> PAGE_SHIFT;
+		pxmram += e - s;
+		pxmram -= e820_hole_size(s, e);
+	}
+
+	e820ram = end_pfn - e820_hole_size(0, end_pfn);
+	if (pxmram < e820ram) {
+		printk(KERN_ERR
+	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
+			(pxmram << PAGE_SHIFT) >> 20,
+			(e820ram << PAGE_SHIFT) >> 20);
+		return 0;
+	}
+	return 1;
+}
+
 void __init acpi_numa_arch_fixup(void) {}
 
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
 	int i;
+
 	if (acpi_numa <= 0)
 		return -1;
 
@@ -188,6 +240,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 			node_clear(i, nodes_parsed);
 	}
 
+	if (!nodes_cover_memory()) {
+		bad_srat();
+		return -1;
+	}
+
 	memnode_shift = compute_hash_shift(nodes, nodes_weight(nodes_parsed));
 	if (memnode_shift < 0) {
 		printk(KERN_ERR
@@ -203,7 +260,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
 		if (!node_isset(cpu_to_node[i], nodes_parsed))
-			cpu_to_node[i] = NUMA_NO_NODE; 
+			numa_set_node(i, NUMA_NO_NODE);
 	}
 	numa_init_array();
 	return 0;