From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 22 Apr 2008 16:38:23 -0500
Subject: stackprotector: use canary at end of stack to indicate overruns at
 oops time

(Updated with a common max-stack-used checker that knows about
the canary, as suggested by Joe Perches)

Use a canary at the end of the stack to clearly indicate
at oops time whether the stack has ever overflowed.

This is a very simple implementation with a couple of
drawbacks:

1) a thread may legitimately use exactly up to the last
   word on the stack

 -- but the chances of doing this and then oopsing later seem slim

2) it's possible that the stack usage isn't dense enough
   that the canary location could get skipped over

 -- but the worst that happens is that we don't flag the overrun
 -- though this happens fairly often in my testing :(

With the code in place, an intentionally-bloated stack oops might
do:

BUG: unable to handle kernel paging request at ffff8103f84cc680
IP: [<ffffffff810253df>] update_curr+0x9a/0xa8
PGD 8063 PUD 0
Thread overran stack or stack corrupted
Oops: 0000 [1] SMP
CPU 0
...

... unless the stack overrun is so bad that it corrupts some other
thread.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/fault.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/mm')
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75..1f524df68b9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -25,6 +25,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long address;
 	int write, si_code;
 	int fault;
+	unsigned long *stackend;
+
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 #endif
@@ -850,6 +853,10 @@ no_context:
 
 	show_fault_oops(regs, error_code, address);
 
+ 	stackend = end_of_stack(tsk);
+	if (*stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	tsk->thread.cr2 = address;
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
-- 
cgit v1.2.3


From dacf7333571d770366bff74d10b56aa545434605 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@infradead.org>
Date: Wed, 7 Jan 2009 17:26:35 +0530
Subject: x86: smp.h move zap_low_mappings declartion to tlbflush.h

Impact: cleanup, moving NON-SMP stuff from smp.h

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index f99a6c6c432..a9dd0b7ad61 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,7 +49,6 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
-#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
-- 
cgit v1.2.3


From 5cca0cf15a94417f49625ce52e23589eed0a1675 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Fri, 9 Jan 2009 14:35:20 -0800
Subject: x86, pat: fix reserve_memtype() for legacy 1MB range

Thierry Vignaud reported:
> http://bugzilla.kernel.org/show_bug.cgi?id=12372
>
> On P4 with an SiS motherboard (video card is a SiS 651)
> X server fails to start with error:
> xf86MapVidMem: Could not mmap framebuffer (0x00000000,0x2000) (Invalid
> argument)

Here X is trying to map first 8KB of memory using /dev/mem. Existing
code treats first 0-4KB of memory as non-RAM and 4KB-8KB as RAM. Recent
code changes don't allow to map memory with different attributes
at the same time.

Fix this by treating the first 1MB legacy region as special and always
track the attribute requests with in this region using linear linked
list (and don't bother if the range is RAM or non-RAM or mixed)

Reported-and-tested-by: Thierry Vignaud <tvignaud@mandriva.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pat.c | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 85cbd3cd372..070ee4a3b22 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -333,11 +333,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 					      req_type & _PAGE_CACHE_MASK);
 	}
 
-	is_range_ram = pagerange_is_ram(start, end);
-	if (is_range_ram == 1)
-		return reserve_ram_pages_type(start, end, req_type, new_type);
-	else if (is_range_ram < 0)
-		return -EINVAL;
+	/*
+	 * For legacy reasons, some parts of the physical address range in the
+	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
+	 * the e820 tables).  So we will track the memory attributes of this
+	 * legacy 1MB region using the linear memtype_list always.
+	 */
+	if (end >= ISA_END_ADDRESS) {
+		is_range_ram = pagerange_is_ram(start, end);
+		if (is_range_ram == 1)
+			return reserve_ram_pages_type(start, end, req_type,
+						      new_type);
+		else if (is_range_ram < 0)
+			return -EINVAL;
+	}
 
 	new  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
 	if (!new)
@@ -437,11 +446,19 @@ int free_memtype(u64 start, u64 end)
 	if (is_ISA_range(start, end - 1))
 		return 0;
 
-	is_range_ram = pagerange_is_ram(start, end);
-	if (is_range_ram == 1)
-		return free_ram_pages_type(start, end);
-	else if (is_range_ram < 0)
-		return -EINVAL;
+	/*
+	 * For legacy reasons, some parts of the physical address range in the
+	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
+	 * the e820 tables).  So we will track the memory attributes of this
+	 * legacy 1MB region using the linear memtype_list always.
+	 */
+	if (end >= ISA_END_ADDRESS) {
+		is_range_ram = pagerange_is_ram(start, end);
+		if (is_range_ram == 1)
+			return free_ram_pages_type(start, end);
+		else if (is_range_ram < 0)
+			return -EINVAL;
+	}
 
 	spin_lock(&memtype_lock);
 	list_for_each_entry(entry, &memtype_list, nd) {
-- 
cgit v1.2.3


From a3c6018e565dc07cf3738ace6bbe412f97b1bba8 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 16 Jan 2009 11:59:33 +0000
Subject: x86: fix assumed to be contiguous leaf page tables for kmap_atomic
 region (take 2)

Debugging and original patch from Nick Piggin <npiggin@suse.de>

The early fixmap pmd entry inserted at the very top of the KVA is causing the
subsequent fixmap mapping code to not provide physically linear pte pages over
the kmap atomic portion of the fixmap (which relies on said property to
calculate pte addresses).

This has caused weird boot failures in kmap_atomic much later in the boot
process (initial userspace faults) on a 32-bit PAE system with a larger number
of CPUs (smaller CPU counts tend not to run over into the next page so don't
show up the problem).

Solve this by attempting to clear out the page table, and copy any of its
entries to the new one. Also, add a bug if a nonlinear condition is encountered
and can't be resolved, which might save some hours of debugging if this fragile
scheme ever breaks again...

Once we have such logic, we can also use it to eliminate the early ioremap
trickery around the page table setup for the fixmap area. This also fixes
potential issues with FIX_* entries sharing the leaf page table with the early
ioremap ones getting discarded by early_ioremap_clear() and not restored by
early_ioremap_reset(). It at once eliminates the temporary (and configuration,
namely NR_CPUS, dependent) unavailability of early fixed mappings during the
time the fixmap area page tables get constructed.

Finally, also replace the hard coded calculation of the initial table space
needed for the fixmap area with a proper one, allowing kernels configured for
large CPU counts to actually boot.

Based-on: Nick Piggin <npiggin@suse.de>
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 arch/x86/mm/ioremap.c | 25 -------------------------
 2 files changed, 45 insertions(+), 28 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 88f1b10de3b..2cef0507441 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -138,6 +138,47 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+					   unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * Something (early fixmap) may already have put a pte
+	 * page here, which causes the page table allocation
+	 * to become nonlinear. Attempt to fix it, and if it
+	 * is still nonlinear then we have to bug.
+	 */
+	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+	    && ((__pa(pte) >> PAGE_SHIFT) < table_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+		pte_t *newpte;
+		int i;
+
+		BUG_ON(after_init_bootmem);
+		newpte = alloc_low_page();
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			set_pte(newpte + i, pte[i]);
+
+		paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+		set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+		BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+		__flush_tlb_all();
+
+		paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+		pte = newpte;
+	}
+	BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+	       && vaddr > fix_to_virt(FIX_KMAP_END)
+	       && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+	return pte;
+}
+
 /*
  * This function initializes a certain range of kernel virtual memory
  * with new bootmem page tables, everywhere page tables are missing in
@@ -154,6 +195,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 	unsigned long vaddr;
 	pgd_t *pgd;
 	pmd_t *pmd;
+	pte_t *pte = NULL;
 
 	vaddr = start;
 	pgd_idx = pgd_index(vaddr);
@@ -165,7 +207,8 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
 		pmd = pmd + pmd_index(vaddr);
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
 							pmd++, pmd_idx++) {
-			one_page_table_init(pmd);
+			pte = page_table_kmap_check(one_page_table_init(pmd),
+			                            pmd, vaddr, pte);
 
 			vaddr += PMD_SIZE;
 		}
@@ -508,7 +551,6 @@ static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
 	 * Fixed mappings, only the page table structure has to be
 	 * created - mappings will be set by set_fixmap():
 	 */
-	early_ioremap_clear();
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
 	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
 	page_table_range_init(vaddr, end, pgd_base);
@@ -801,7 +843,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse)
 	tables += PAGE_ALIGN(ptes * sizeof(pte_t));
 
 	/* for fixmap */
-	tables += PAGE_SIZE * 2;
+	tables += PAGE_ALIGN(__end_of_fixed_addresses * sizeof(pte_t));
 
 	/*
 	 * RED-PEN putting page tables only on node 0 could
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bd85d42819e..af750ab973b 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -557,34 +557,9 @@ void __init early_ioremap_init(void)
 	}
 }
 
-void __init early_ioremap_clear(void)
-{
-	pmd_t *pmd;
-
-	if (early_ioremap_debug)
-		printk(KERN_INFO "early_ioremap_clear()\n");
-
-	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
-	pmd_clear(pmd);
-	paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
-	__flush_tlb_all();
-}
-
 void __init early_ioremap_reset(void)
 {
-	enum fixed_addresses idx;
-	unsigned long addr, phys;
-	pte_t *pte;
-
 	after_paging_init = 1;
-	for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) {
-		addr = fix_to_virt(idx);
-		pte = early_ioremap_pte(addr);
-		if (pte_present(*pte)) {
-			phys = pte_val(*pte) & PAGE_MASK;
-			set_fixmap(idx, phys);
-		}
-	}
 }
 
 static void __init __early_set_fixmap(enum fixed_addresses idx,
-- 
cgit v1.2.3


From f5495506c3c1300d249d403c36f92de71920dbeb Mon Sep 17 00:00:00 2001
From: Gary Hade <garyhade@us.ibm.com>
Date: Mon, 19 Jan 2009 13:46:41 -0800
Subject: x86: remove kernel_physical_mapping_init() from init section

Impact: fix crash with memory hotplug enabled

kernel_physical_mapping_init() is called during memory hotplug
so it does not belong in the init section.

If the kernel is built with CONFIG_DEBUG_SECTION_MISMATCH=y on
the make command line, arch/x86/mm/init_64.c is compiled with
the -fno-inline-functions-called-once gcc option defeating
inlining of kernel_physical_mapping_init() within init_memory_mapping().

When kernel_physical_mapping_init() is not inlined it is placed
in the .init.text section according to the __init in it's current
declaration.  A later call to kernel_physical_mapping_init() during
a memory hotplug operation encounters an int3 trap because the
.init.text section memory has been freed.

This patch eliminates the crash caused by the int3 trap by moving the
non-inlined kernel_physical_mapping_init() from .init.text to .meminit.text.

Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 23f68e77ad1..e6d36b49025 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,7 +596,7 @@ static void __init init_gbpages(void)
 		direct_gbpages = 0;
 }
 
-static unsigned long __init kernel_physical_mapping_init(unsigned long start,
+static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 						unsigned long end,
 						unsigned long page_size_mask)
 {
-- 
cgit v1.2.3


From 92181f190b649f7ef2b79cbf5c00f26ccc66da2a Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 20 Jan 2009 04:24:26 +0100
Subject: x86: optimise x86's do_page_fault (C entry point for the page fault
 path)

Impact: cleanup, restructure code to improve assembly

gcc isn't _all_ that smart about spilling registers to stack or reusing
stack slots, even with branch annotations. do_page_fault contained a lot
of functionality, so split unlikely paths into their own functions, and
mark them as noinline just to be sure. I consider this actually to be
somewhat of a cleanup too: the main function now contains about half
the number of lines so the normal path is easier to read, while the error
cases are also nicely split away.

Also, ensure the order of arguments to functions is always the same: regs,
addr, error_code. This can reduce code size a tiny bit, and just looks neater
too.

And add a couple of branch annotations.

Before:
  do_page_fault:
          subq    $360, %rsp      #,

After:
  do_page_fault:
          subq    $56, %rsp       #,

bloat-o-meter:
  add/remove: 8/0 grow/shrink: 0/1 up/down: 2222/-1680 (542)
  function                                     old     new   delta
  __bad_area_nosemaphore                         -     506    +506
  no_context                                     -     474    +474
  vmalloc_fault                                  -     424    +424
  spurious_fault                                 -     358    +358
  mm_fault_error                                 -     272    +272
  bad_area_access_error                          -      89     +89
  bad_area                                       -      89     +89
  bad_area_nosemaphore                           -      10     +10
  do_page_fault                               2464     784   -1680

Yes, the total size increases by 542 bytes, due to the extra function calls.
But these will very rarely be called (except for vmalloc_fault) in a normal
workload. Importantly, do_page_fault is less than 1/3rd it's original size,
and touches far less stack.

Existing gotos and branch hints did move a lot of the infrequently used text
out of the fastpath, but that's even further improved after this patch.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 438 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 256 insertions(+), 182 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 90dfae511a4..033292dc9e2 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -91,8 +91,8 @@ static inline int notify_page_fault(struct pt_regs *regs)
  *
  * Opcode checker based on code by Richard Brunner
  */
-static int is_prefetch(struct pt_regs *regs, unsigned long addr,
-		       unsigned long error_code)
+static int is_prefetch(struct pt_regs *regs, unsigned long error_code,
+			unsigned long addr)
 {
 	unsigned char *instr;
 	int scan_more = 1;
@@ -409,15 +409,15 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 }
 
 #ifdef CONFIG_X86_64
-static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
-				 unsigned long error_code)
+static noinline void pgtable_bad(struct pt_regs *regs,
+			 unsigned long error_code, unsigned long address)
 {
 	unsigned long flags = oops_begin();
 	int sig = SIGKILL;
-	struct task_struct *tsk;
+	struct task_struct *tsk = current;
 
 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
-	       current->comm, address);
+	       tsk->comm, address);
 	dump_pagetable(address);
 	tsk = current;
 	tsk->thread.cr2 = address;
@@ -429,6 +429,190 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 }
 #endif
 
+static noinline void no_context(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address)
+{
+	struct task_struct *tsk = current;
+#ifdef CONFIG_X86_64
+	unsigned long flags;
+	int sig;
+#endif
+
+	/* Are we prepared to handle this kernel fault?  */
+	if (fixup_exception(regs))
+		return;
+
+	/*
+	 * X86_32
+	 * Valid to do another page fault here, because if this fault
+	 * had been triggered by is_prefetch fixup_exception would have
+	 * handled it.
+	 *
+	 * X86_64
+	 * Hall of shame of CPU/BIOS bugs.
+	 */
+	if (is_prefetch(regs, error_code, address))
+		return;
+
+	if (is_errata93(regs, address))
+		return;
+
+	/*
+	 * Oops. The kernel tried to access some bad page. We'll have to
+	 * terminate things with extreme prejudice.
+	 */
+#ifdef CONFIG_X86_32
+	bust_spinlocks(1);
+#else
+	flags = oops_begin();
+#endif
+
+	show_fault_oops(regs, error_code, address);
+
+	tsk->thread.cr2 = address;
+	tsk->thread.trap_no = 14;
+	tsk->thread.error_code = error_code;
+
+#ifdef CONFIG_X86_32
+	die("Oops", regs, error_code);
+	bust_spinlocks(0);
+	do_exit(SIGKILL);
+#else
+	sig = SIGKILL;
+	if (__die("Oops", regs, error_code))
+		sig = 0;
+	/* Executive summary in case the body of the oops scrolled away */
+	printk(KERN_EMERG "CR2: %016lx\n", address);
+	oops_end(flags, regs, sig);
+#endif
+}
+
+static void __bad_area_nosemaphore(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address,
+			int si_code)
+{
+	struct task_struct *tsk = current;
+
+	/* User mode accesses just cause a SIGSEGV */
+	if (error_code & PF_USER) {
+		/*
+		 * It's possible to have interrupts off here.
+		 */
+		local_irq_enable();
+
+		/*
+		 * Valid to do another page fault here because this one came
+		 * from user space.
+		 */
+		if (is_prefetch(regs, error_code, address))
+			return;
+
+		if (is_errata100(regs, address))
+			return;
+
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit()) {
+			printk(
+			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+			tsk->comm, task_pid_nr(tsk), address,
+			(void *) regs->ip, (void *) regs->sp, error_code);
+			print_vma_addr(" in ", regs->ip);
+			printk("\n");
+		}
+
+		tsk->thread.cr2 = address;
+		/* Kernel addresses are always protection faults */
+		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+		tsk->thread.trap_no = 14;
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+		return;
+	}
+
+	if (is_f00f_bug(regs, address))
+		return;
+
+	no_context(regs, error_code, address);
+}
+
+static noinline void bad_area_nosemaphore(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address)
+{
+	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void __bad_area(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address,
+			int si_code)
+{
+	struct mm_struct *mm = current->mm;
+
+	/*
+	 * Something tried to access memory that isn't in our memory map..
+	 * Fix it, but check if it's kernel or user first..
+	 */
+	up_read(&mm->mmap_sem);
+
+	__bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void bad_area(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address)
+{
+	__bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void bad_area_access_error(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address)
+{
+	__bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+static void out_of_memory(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address)
+{
+	/*
+	 * We ran out of memory, call the OOM killer, and return the userspace
+	 * (which will retry the fault, or kill us if we got oom-killed).
+	 */
+	up_read(&current->mm->mmap_sem);
+	pagefault_out_of_memory();
+}
+
+static void do_sigbus(struct pt_regs *regs,
+			unsigned long error_code, unsigned long address)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *mm = tsk->mm;
+
+	up_read(&mm->mmap_sem);
+
+	/* Kernel mode? Handle exceptions or die */
+	if (!(error_code & PF_USER))
+		no_context(regs, error_code, address);
+#ifdef CONFIG_X86_32
+	/* User space => ok to do another page fault */
+	if (is_prefetch(regs, error_code, address))
+		return;
+#endif
+	tsk->thread.cr2 = address;
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 14;
+	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+}
+
+static noinline void mm_fault_error(struct pt_regs *regs,
+		unsigned long error_code, unsigned long address, unsigned int fault)
+{
+	if (fault & VM_FAULT_OOM)
+		out_of_memory(regs, error_code, address);
+	else if (fault & VM_FAULT_SIGBUS)
+		do_sigbus(regs, error_code, address);
+	else
+		BUG();
+}
+
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
 	if ((error_code & PF_WRITE) && !pte_write(*pte))
@@ -448,8 +632,8 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static int spurious_fault(unsigned long address,
-			  unsigned long error_code)
+static noinline int spurious_fault(unsigned long error_code,
+				unsigned long address)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -494,7 +678,7 @@ static int spurious_fault(unsigned long address,
  *
  * This assumes no large pages in there.
  */
-static int vmalloc_fault(unsigned long address)
+static noinline int vmalloc_fault(unsigned long address)
 {
 #ifdef CONFIG_X86_32
 	unsigned long pgd_paddr;
@@ -573,6 +757,25 @@ static int vmalloc_fault(unsigned long address)
 
 int show_unhandled_signals = 1;
 
+static inline int access_error(unsigned long error_code, int write,
+				struct vm_area_struct *vma)
+{
+	if (write) {
+		/* write, present and write, not present */
+		if (unlikely(!(vma->vm_flags & VM_WRITE)))
+			return 1;
+	} else if (unlikely(error_code & PF_PROT)) {
+		/* read, present */
+		return 1;
+	} else {
+		/* read, not present */
+		if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -583,16 +786,12 @@ asmlinkage
 #endif
 void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+	unsigned long address;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	unsigned long address;
-	int write, si_code;
+	int write;
 	int fault;
-#ifdef CONFIG_X86_64
-	unsigned long flags;
-	int sig;
-#endif
 
 	tsk = current;
 	mm = tsk->mm;
@@ -601,9 +800,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	/* get the address */
 	address = read_cr2();
 
-	si_code = SEGV_MAPERR;
-
-	if (notify_page_fault(regs))
+	if (unlikely(notify_page_fault(regs)))
 		return;
 	if (unlikely(kmmio_fault(regs, address)))
 		return;
@@ -631,17 +828,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 			return;
 
 		/* Can handle a stale RO->RW TLB */
-		if (spurious_fault(address, error_code))
+		if (spurious_fault(error_code, address))
 			return;
 
 		/*
 		 * Don't take the mm semaphore here. If we fixup a prefetch
 		 * fault we could otherwise deadlock.
 		 */
-		goto bad_area_nosemaphore;
+		bad_area_nosemaphore(regs, error_code, address);
+		return;
 	}
 
-
 	/*
 	 * It's safe to allow irq's after cr2 has been saved and the
 	 * vmalloc fault has been handled.
@@ -657,15 +854,17 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 #ifdef CONFIG_X86_64
 	if (unlikely(error_code & PF_RSVD))
-		pgtable_bad(address, regs, error_code);
+		pgtable_bad(regs, error_code, address);
 #endif
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
 	 * atomic region then we must not take the fault.
 	 */
-	if (unlikely(in_atomic() || !mm))
-		goto bad_area_nosemaphore;
+	if (unlikely(in_atomic() || !mm)) {
+		bad_area_nosemaphore(regs, error_code, address);
+		return;
+	}
 
 	/*
 	 * When running in the kernel we expect faults to occur only to
@@ -683,20 +882,26 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	 * source.  If this is invalid we can skip the address space check,
 	 * thus avoiding the deadlock.
 	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 		if ((error_code & PF_USER) == 0 &&
-		    !search_exception_tables(regs->ip))
-			goto bad_area_nosemaphore;
+		    !search_exception_tables(regs->ip)) {
+			bad_area_nosemaphore(regs, error_code, address);
+			return;
+		}
 		down_read(&mm->mmap_sem);
 	}
 
 	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
+	if (unlikely(!vma)) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+	if (likely(vma->vm_start <= address))
 		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
+	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+		bad_area(regs, error_code, address);
+		return;
+	}
 	if (error_code & PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
@@ -704,31 +909,25 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		 * and pusha to work.  ("enter $65535,$31" pushes
 		 * 32 pointers and then decrements %sp by 65535.)
 		 */
-		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
-			goto bad_area;
+		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+			bad_area(regs, error_code, address);
+			return;
+		}
 	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+	if (unlikely(expand_stack(vma, address))) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+
+	/*
+	 * Ok, we have a good vm_area for this memory access, so
+	 * we can handle it..
+	 */
 good_area:
-	si_code = SEGV_ACCERR;
-	write = 0;
-	switch (error_code & (PF_PROT|PF_WRITE)) {
-	default:	/* 3: write, present */
-		/* fall through */
-	case PF_WRITE:		/* write, not present */
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-		write++;
-		break;
-	case PF_PROT:		/* read, present */
-		goto bad_area;
-	case 0:			/* read, not present */
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
+	write = error_code & PF_WRITE;
+	if (unlikely(access_error(error_code, write, vma))) {
+		bad_area_access_error(regs, error_code, address);
+		return;
 	}
 
 	/*
@@ -738,11 +937,8 @@ good_area:
 	 */
 	fault = handle_mm_fault(mm, vma, address, write);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		if (fault & VM_FAULT_OOM)
-			goto out_of_memory;
-		else if (fault & VM_FAULT_SIGBUS)
-			goto do_sigbus;
-		BUG();
+		mm_fault_error(regs, error_code, address, fault);
+		return;
 	}
 	if (fault & VM_FAULT_MAJOR)
 		tsk->maj_flt++;
@@ -760,128 +956,6 @@ good_area:
 	}
 #endif
 	up_read(&mm->mmap_sem);
-	return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
-		/*
-		 * It's possible to have interrupts off here.
-		 */
-		local_irq_enable();
-
-		/*
-		 * Valid to do another page fault here because this one came
-		 * from user space.
-		 */
-		if (is_prefetch(regs, address, error_code))
-			return;
-
-		if (is_errata100(regs, address))
-			return;
-
-		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-		    printk_ratelimit()) {
-			printk(
-			"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
-			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-			tsk->comm, task_pid_nr(tsk), address,
-			(void *) regs->ip, (void *) regs->sp, error_code);
-			print_vma_addr(" in ", regs->ip);
-			printk("\n");
-		}
-
-		tsk->thread.cr2 = address;
-		/* Kernel addresses are always protection faults */
-		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
-		tsk->thread.trap_no = 14;
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
-		return;
-	}
-
-	if (is_f00f_bug(regs, address))
-		return;
-
-no_context:
-	/* Are we prepared to handle this kernel fault?  */
-	if (fixup_exception(regs))
-		return;
-
-	/*
-	 * X86_32
-	 * Valid to do another page fault here, because if this fault
-	 * had been triggered by is_prefetch fixup_exception would have
-	 * handled it.
-	 *
-	 * X86_64
-	 * Hall of shame of CPU/BIOS bugs.
-	 */
-	if (is_prefetch(regs, address, error_code))
-		return;
-
-	if (is_errata93(regs, address))
-		return;
-
-/*
- * Oops. The kernel tried to access some bad page. We'll have to
- * terminate things with extreme prejudice.
- */
-#ifdef CONFIG_X86_32
-	bust_spinlocks(1);
-#else
-	flags = oops_begin();
-#endif
-
-	show_fault_oops(regs, error_code, address);
-
-	tsk->thread.cr2 = address;
-	tsk->thread.trap_no = 14;
-	tsk->thread.error_code = error_code;
-
-#ifdef CONFIG_X86_32
-	die("Oops", regs, error_code);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
-#else
-	sig = SIGKILL;
-	if (__die("Oops", regs, error_code))
-		sig = 0;
-	/* Executive summary in case the body of the oops scrolled away */
-	printk(KERN_EMERG "CR2: %016lx\n", address);
-	oops_end(flags, regs, sig);
-#endif
-
-out_of_memory:
-	/*
-	 * We ran out of memory, call the OOM killer, and return the userspace
-	 * (which will retry the fault, or kill us if we got oom-killed).
-	 */
-	up_read(&mm->mmap_sem);
-	pagefault_out_of_memory();
-	return;
-
-do_sigbus:
-	up_read(&mm->mmap_sem);
-
-	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & PF_USER))
-		goto no_context;
-#ifdef CONFIG_X86_32
-	/* User space => ok to do another page fault */
-	if (is_prefetch(regs, address, error_code))
-		return;
-#endif
-	tsk->thread.cr2 = address;
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 14;
-	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 
 DEFINE_SPINLOCK(pgd_lock);
-- 
cgit v1.2.3


From 55f4949f5765e7a29863b6d17a774601810732f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 21 Jan 2009 10:08:53 +0100
Subject: x86, mm: move tlb.c to arch/x86/mm/

Impact: cleanup

Now that it's unified, move the (SMP) TLB flushing code from arch/x86/kernel/
to arch/x86/mm/, where it belongs logically.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/Makefile |   2 +
 arch/x86/mm/tlb.c    | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 298 insertions(+)
 create mode 100644 arch/x86/mm/tlb.c

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index d8cc96a2738..9f05157220f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,8 @@
 obj-y	:=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o gup.o
 
+obj-$(CONFIG_X86_SMP)		+= tlb.o
+
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 00000000000..b3ca1b94065
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,296 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
+			= { &init_mm, 0, };
+
+#include <mach_ipi.h>
+/*
+ *	Smarter SMP flushing macros.
+ *		c/o Linus Torvalds.
+ *
+ *	These mean you can really definitely utterly forget about
+ *	writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *	Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *	More scalable flush, from Andi Kleen
+ *
+ *	To avoid global state use 8 different call vectors.
+ *	Each CPU uses a specific vector to trigger flushes on other
+ *	CPUs. Depending on the received vector the target CPUs look into
+ *	the right per cpu variable for the flush data.
+ *
+ *	With more than 8 CPUs they are hashed to the 8 available
+ *	vectors. The limited global vector space forces us to this right now.
+ *	In future when interrupts are split into per CPU domains this could be
+ *	fixed, at the cost of triggering multiple IPIs in some cases.
+ */
+
+union smp_flush_state {
+	struct {
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
+		spinlock_t tlbstate_lock;
+		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
+	};
+	char pad[SMP_CACHE_BYTES];
+} ____cacheline_aligned;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+void leave_mm(int cpu)
+{
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		BUG();
+	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	load_cr3(swapper_pg_dir);
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ *	for the wrong mm, and in the worst case we perform a superfluous
+ *	tlb flush.
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
+ *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ *	was in lazy tlb mode.
+ * 1a3) update cpu active_mm
+ *	Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *	Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ *	cpu active_mm is correct, cpu0 already handles
+ *	flush ipis.
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *	Atomically set the bit [other cpus will start sending flush ipis],
+ *	and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu mmu_state is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
+ */
+
+/*
+ * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop
+ * but still used for documentation purpose but the usage is slightly
+ * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
+ * entry calls in with the first parameter in %eax.  Maybe define
+ * intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+	unsigned int cpu;
+	unsigned int sender;
+	union smp_flush_state *f;
+
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the negated interrupt vector.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
+	f = &per_cpu(flush_state, sender);
+
+	if (!cpumask_test_cpu(cpu, to_cpumask(f->flush_cpumask)))
+		goto out;
+		/*
+		 * This was a BUG() but until someone can quote me the
+		 * line from the intel manual that guarantees an IPI to
+		 * multiple CPUs is retried _only_ on the erroring CPUs
+		 * its staying as a return
+		 *
+		 * BUG();
+		 */
+
+	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+			if (f->flush_va == TLB_FLUSH_ALL)
+				local_flush_tlb();
+			else
+				__flush_tlb_one(f->flush_va);
+		} else
+			leave_mm(cpu);
+	}
+out:
+	ack_APIC_irq();
+	smp_mb__before_clear_bit();
+	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+	smp_mb__after_clear_bit();
+	inc_irq_stat(irq_tlb_count);
+}
+
+static void flush_tlb_others_ipi(const struct cpumask *cpumask,
+				 struct mm_struct *mm, unsigned long va)
+{
+	unsigned int sender;
+	union smp_flush_state *f;
+
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &per_cpu(flush_state, sender);
+
+	/*
+	 * Could avoid this lock when
+	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	 * probably not worth checking this for a cache-hot lock.
+	 */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpumask_andnot(to_cpumask(f->flush_cpumask),
+		       cpumask, cpumask_of(smp_processor_id()));
+
+	/*
+	 * Make the above memory operations globally visible before
+	 * sending the IPI.
+	 */
+	smp_mb();
+	/*
+	 * We have to send the IPI only to
+	 * CPUs affected.
+	 */
+	send_IPI_mask(to_cpumask(f->flush_cpumask),
+		      INVALIDATE_TLB_VECTOR_START + sender);
+
+	while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+		cpu_relax();
+
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+			     struct mm_struct *mm, unsigned long va)
+{
+	if (is_uv_system()) {
+		unsigned int cpu;
+
+		cpu = get_cpu();
+		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
+		if (cpumask)
+			flush_tlb_others_ipi(cpumask, mm, va);
+		put_cpu();
+		return;
+	}
+	flush_tlb_others_ipi(cpumask, mm, va);
+}
+
+static int __cpuinit init_smp_flush(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
+
+	return 0;
+}
+core_initcall(init_smp_flush);
+
+void flush_tlb_current_task(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	preempt_disable();
+
+	local_flush_tlb();
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+	preempt_enable();
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			local_flush_tlb();
+		else
+			leave_mm(smp_processor_id());
+	}
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
+
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			__flush_tlb_one(va);
+		else
+			leave_mm(smp_processor_id());
+	}
+
+	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(&mm->cpu_vm_mask, mm, va);
+
+	preempt_enable();
+}
+
+static void do_flush_tlb_all(void *info)
+{
+	unsigned long cpu = smp_processor_id();
+
+	__flush_tlb_all();
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+		leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+	on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
-- 
cgit v1.2.3


From 4ec71fa2d2c3f1040348f2604f4b8ccc833d1c2e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 21 Jan 2009 10:24:27 +0100
Subject: x86: uv cleanup, build fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix:

 arch/x86/mm/srat_64.c: In function ‘acpi_numa_processor_affinity_init’:
 arch/x86/mm/srat_64.c:141: error: implicit declaration of function ‘get_uv_system_type’
 arch/x86/mm/srat_64.c:141: error: ‘UV_X2APIC’ undeclared (first use in this function)
 arch/x86/mm/srat_64.c:141: error: (Each undeclared identifier is reported only once
 arch/x86/mm/srat_64.c:141: error: for each function it appears in.)

A couple of UV definitions were moved to asm/uv/uv.h, but srat_64.c did
not include that header. Add it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 09737c8af07..15df1baee10 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -21,6 +21,7 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 
 int acpi_numa __initdata;
 
-- 
cgit v1.2.3


From a1e46212a410793d575718818e81ddc442a65283 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Tue, 20 Jan 2009 14:20:21 -0800
Subject: x86: fix page attribute corruption with cpa()

Impact: fix sporadic slowdowns and warning messages

This patch fixes a performance issue reported by Linus on his
Nehalem system. While Linus reverted the PAT patch (commit
58dab916dfb57328d50deb0aa9b3fc92efa248ff) which exposed the issue,
existing cpa() code can potentially still cause wrong(page attribute
corruption) behavior.

This patch also fixes the "WARNING: at arch/x86/mm/pageattr.c:560" that
various people reported.

In 64bit kernel, kernel identity mapping might have holes depending
on the available memory and how e820 reports the address range
covering the RAM, ACPI, PCI reserved regions. If there is a 2MB/1GB hole
in the address range that is not listed by e820 entries, kernel identity
mapping will have a corresponding hole in its 1-1 identity mapping.

If cpa() happens on the kernel identity mapping which falls into these holes,
existing code fails like this:

	__change_page_attr_set_clr()
		__change_page_attr()
			returns 0 because of if (!kpte). But doesn't
			set cpa->numpages and cpa->pfn.
		cpa_process_alias()
			uses uninitialized cpa->pfn (random value)
			which can potentially lead to changing the page
			attribute of kernel text/data, kernel identity
			mapping of RAM pages etc. oops!

This bug was easily exposed by another PAT patch which was doing
cpa() more often on kernel identity mapping holes (physical range between
max_low_pfn_mapped and 4GB), where in here it was setting the
cache disable attribute(PCD) for kernel identity mappings aswell.

Fix cpa() to handle the kernel identity mapping holes. Retain
the WARN() for cpa() calls to other not present address ranges
(kernel-text/data, ioremap() addresses)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e89d24815f2..84ba74820ad 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -534,6 +534,36 @@ out_unlock:
 	return 0;
 }
 
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+			       int primary)
+{
+	/*
+	 * Ignore all non primary paths.
+	 */
+	if (!primary)
+		return 0;
+
+	/*
+	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
+	 * to have holes.
+	 * Also set numpages to '1' indicating that we processed cpa req for
+	 * one virtual address page and its pfn. TBD: numpages can be set based
+	 * on the initial value and the level returned by lookup_address().
+	 */
+	if (within(vaddr, PAGE_OFFSET,
+		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+		cpa->numpages = 1;
+		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+		return 0;
+	} else {
+		WARN(1, KERN_WARNING "CPA: called for zero pte. "
+			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+			*cpa->vaddr);
+
+		return -EFAULT;
+	}
+}
+
 static int __change_page_attr(struct cpa_data *cpa, int primary)
 {
 	unsigned long address;
@@ -549,17 +579,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
 repeat:
 	kpte = lookup_address(address, &level);
 	if (!kpte)
-		return 0;
+		return __cpa_process_fault(cpa, address, primary);
 
 	old_pte = *kpte;
-	if (!pte_val(old_pte)) {
-		if (!primary)
-			return 0;
-		WARN(1, KERN_WARNING "CPA: called for zero pte. "
-		       "vaddr = %lx cpa->vaddr = %lx\n", address,
-		       *cpa->vaddr);
-		return -EINVAL;
-	}
+	if (!pte_val(old_pte))
+		return __cpa_process_fault(cpa, address, primary);
 
 	if (level == PG_LEVEL_4K) {
 		pte_t new_pte;
@@ -657,12 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 		vaddr = *cpa->vaddr;
 
 	if (!(within(vaddr, PAGE_OFFSET,
-		    PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
-#ifdef CONFIG_X86_64
-		|| within(vaddr, PAGE_OFFSET + (1UL<<32),
-		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
-#endif
-	)) {
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
 
 		alias_cpa = *cpa;
 		temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
-- 
cgit v1.2.3


From 9597134218300c045cf219be3664615e97cb239c Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Tue, 13 Jan 2009 10:21:30 -0800
Subject: x86: fix PTE corruption issue while mapping RAM using /dev/mem

Beschorner Daniel reported:
> hwinfo problem since 2.6.28, showing this in the oops:
>	Corrupted page table at address 7fd04de3ec00

Also, PaX Team reported a regression with this commit:

>	commit 9542ada803198e6eba29d3289abb39ea82047b92
>	Author: Suresh Siddha <suresh.b.siddha@intel.com>
>	Date:   Wed Sep 24 08:53:33 2008 -0700
>
>	    x86: track memtype for RAM in page struct

This commit breaks mapping any RAM page through /dev/mem, as the
reserve_memtype() was not initializing the return attribute type and as such
corrupting the PTE entry that was setup with the return attribute type.

Because of this bug, application mapping this RAM page through /dev/mem
will die with "Corrupted page table at address xxxx" message in the kernel
log and also the kernel identity mapping which maps the underlying RAM
page gets converted to UC.

Fix this by initializing the return attribute type before calling
reserve_ram_pages_type()

Reported-by: PaX Team <pageexec@freemail.hu>
Reported-and-tested-by: Beschorner Daniel <Daniel.Beschorner@facton.com>
Tested-and-Acked-by: PaX Team <pageexec@freemail.hu>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 070ee4a3b22..ffc88cc00fd 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -333,6 +333,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 					      req_type & _PAGE_CACHE_MASK);
 	}
 
+	if (new_type)
+		*new_type = actual_type;
+
 	/*
 	 * For legacy reasons, some parts of the physical address range in the
 	 * legacy 1MB region is treated as non-RAM (even when listed as RAM in
@@ -356,9 +359,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	new->end	= end;
 	new->type	= actual_type;
 
-	if (new_type)
-		*new_type = actual_type;
-
 	spin_lock(&memtype_lock);
 
 	if (cached_entry && start >= cached_start)
-- 
cgit v1.2.3


From ef5fa0ab24b87646c7bc98645acbb4b51fc2acd4 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 23 Jan 2009 14:14:21 -0800
Subject: x86: work around PAGE_KERNEL_WC not getting WC in
 iomap_atomic_prot_pfn.

In the absence of PAT, PAGE_KERNEL_WC ends up mapping to a memory type that
gets UC behavior even in the presence of a WC MTRR covering the area in
question.  By swapping to PAGE_KERNEL_UC_MINUS, we can get the actual
behavior the caller wanted (WC if you can manage it, UC otherwise).

This recovers the 40% performance improvement of using WC in the DRM
to upload vertex data.

Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/iomap_32.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index d0151d8ce45..ca53224fc56 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -17,6 +17,7 @@
  */
 
 #include <asm/iomap.h>
+#include <asm/pat.h>
 #include <linux/module.h>
 
 /* Map 'pfn' using fixed map 'type' and protections 'prot'
@@ -29,6 +30,15 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
 
 	pagefault_disable();
 
+	/*
+	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
+	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 */
+	if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+		prot = PAGE_KERNEL_UC_MINUS;
+
 	idx = type + KM_TYPE_NR*smp_processor_id();
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	set_pte(kmap_pte-idx, pfn_pte(pfn, prot));
-- 
cgit v1.2.3


From 6470aff619fbb9dff8dfe8afa5033084cd55ca20 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Tue, 27 Jan 2009 12:56:47 +0900
Subject: x86: move 64-bit NUMA code

Impact: Code movement, no functional change.

Move the 64-bit NUMA code from setup_percpu.c to numa_64.c

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/mm/numa_64.c | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)

(limited to 'arch/x86/mm')

diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 71a14f89f89..08d140fbc31 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,6 +20,12 @@
 #include <asm/acpi.h>
 #include <asm/k8.h>
 
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
@@ -33,6 +39,21 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
+DEFINE_PER_CPU(int, node_number) = 0;
+EXPORT_PER_CPU_SYMBOL(node_number);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+/*
+ * Which logical CPUs are on which nodes
+ */
+cpumask_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
 /*
  * Given a shift value, try to populate memnodemap[]
  * Returns :
@@ -640,3 +661,199 @@ void __init init_cpu_to_node(void)
 #endif
 
 
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node, num = 0;
+	cpumask_t *map;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES) {
+		for_each_node_mask(node, node_possible_map)
+			num = node;
+		nr_node_ids = num + 1;
+	}
+
+	/* allocate the map */
+	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
+
+	pr_debug("Node to cpumask map at %p for %d nodes\n",
+		 map, nr_node_ids);
+
+	/* node_to_cpumask() will now work */
+	node_to_cpumask_map = map;
+}
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	if (node != NUMA_NO_NODE)
+		per_cpu(node_number, cpu) = node;
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+
+/*
+ * --------- debug versions of the numa functions ---------
+ */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	cpumask_t *mask;
+	char buf[64];
+
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_ERR "node_to_cpumask_map NULL\n");
+		dump_stack();
+		return;
+	}
+
+	mask = &node_to_cpumask_map[node];
+	if (enable)
+		cpu_set(cpu, *mask);
+	else
+		cpu_clear(cpu, *mask);
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+
+int cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!per_cpu_offset(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+
+/* empty cpumask */
+static const cpumask_t cpu_mask_none;
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const cpumask_t *cpumask_of_node(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return (const cpumask_t *)&cpu_online_map;
+	}
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return &cpu_mask_none;
+	}
+	return &node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ *
+ * Side note: this function creates the returned cpumask on the stack
+ * so with a high NR_CPUS count, excessive stack space is used.  The
+ * node_to_cpumask_ptr function should be used whenever possible.
+ */
+cpumask_t node_to_cpumask(int node)
+{
+	if (node_to_cpumask_map == NULL) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
+		dump_stack();
+		return cpu_online_map;
+	}
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return cpu_mask_none;
+	}
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(node_to_cpumask);
+
+/*
+ * --------- end of debug versions of the numa functions ---------
+ */
+
+#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
-- 
cgit v1.2.3