From dcc17d1baef3721d1574e5b2f4f2d4607514bcff Mon Sep 17 00:00:00 2001
From: Peter Keilty
Date: Mon, 31 Oct 2005 16:44:47 -0500
Subject: [IA64] Use bitmaps for efficient context allocation/free

Corrects the very inefficient method of finding free context_ids in
get_mmu_context().  Instead of walking the task_list of all processes,
two bitmaps are used to efficiently store and look up state: in-use and
needs-flushing.  The entire RID address space is now used before calling
wrap_mmu_context() and doing a global TLB flush.

Special thanks to Ken and Rohit for their review and modifications in
using a bit flushmap.

Signed-off-by: Peter Keilty
Signed-off-by: Tony Luck
---
 arch/ia64/kernel/setup.c       |  1 +
 arch/ia64/mm/tlb.c             | 56 ++++++++++++++++++++----------------------
 include/asm-ia64/mmu_context.h | 19 +++++++++++---
 include/asm-ia64/tlbflush.h    |  1 +
 4 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index fc56ca2da35..c9388a92cf4 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -454,6 +454,7 @@ setup_arch (char **cmdline_p)
 #endif

 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */

 #ifdef CONFIG_ACPI
 	acpi_boot_init();
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index c79a9b96d02..39628fca274 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -8,6 +8,8 @@
  *	Modified RID allocation for SMP
  *	Goutham Rao
  *	IPI based ptc implementation and A-step IPI implementation.
+ *	Rohit Seth
+ *	Ken Chen
  */
 #include
 #include
@@ -16,12 +18,14 @@
 #include
 #include
 #include
+#include

 #include
 #include
 #include
 #include
 #include
+#include

 static struct {
 	unsigned long mask;	/* mask of supported purge page-sizes */
@@ -31,49 +35,43 @@ static struct {
 struct ia64_ctx ia64_ctx = {
 	.lock = SPIN_LOCK_UNLOCKED,
 	.next = 1,
-	.limit = (1 << 15) - 1,	/* start out with the safe (architected) limit */
 	.max_ctx = ~0U
 };

 DEFINE_PER_CPU(u8, ia64_need_tlb_flush);

+/*
+ * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
+ * Called after cpu_init() has setup ia64_ctx.max_ctx based on
+ * maximum RID that is supported by boot CPU.
+ */
+void __init
+mmu_context_init (void)
+{
+	ia64_ctx.bitmap = alloc_bootmem((ia64_ctx.max_ctx+1)>>3);
+	ia64_ctx.flushmap = alloc_bootmem((ia64_ctx.max_ctx+1)>>3);
+}
+
 /*
  * Acquire the ia64_ctx.lock before calling this function!
*/ void wrap_mmu_context (struct mm_struct *mm) { - unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx; - struct task_struct *tsk; int i; + unsigned long flush_bit; - if (ia64_ctx.next > max_ctx) - ia64_ctx.next = 300; /* skip daemons */ - ia64_ctx.limit = max_ctx + 1; - - /* - * Scan all the task's mm->context and set proper safe range - */ - - read_lock(&tasklist_lock); - repeat: - for_each_process(tsk) { - if (!tsk->mm) - continue; - tsk_context = tsk->mm->context; - if (tsk_context == ia64_ctx.next) { - if (++ia64_ctx.next >= ia64_ctx.limit) { - /* empty range: reset the range limit and start over */ - if (ia64_ctx.next > max_ctx) - ia64_ctx.next = 300; - ia64_ctx.limit = max_ctx + 1; - goto repeat; - } - } - if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit)) - ia64_ctx.limit = tsk_context; + for (i=0; i <= ia64_ctx.max_ctx / BITS_PER_LONG; i++) { + flush_bit = xchg(&ia64_ctx.flushmap[i], 0); + ia64_ctx.bitmap[i] ^= flush_bit; } - read_unlock(&tasklist_lock); + + /* use offset at 300 to skip daemons */ + ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, + ia64_ctx.max_ctx, 300); + ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap, + ia64_ctx.max_ctx, ia64_ctx.next); + /* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */ { int cpu = get_cpu(); /* prevent preemption/migration */ diff --git a/include/asm-ia64/mmu_context.h b/include/asm-ia64/mmu_context.h index 8d6e72f7b08..8d9b30b5f7d 100644 --- a/include/asm-ia64/mmu_context.h +++ b/include/asm-ia64/mmu_context.h @@ -32,13 +32,17 @@ struct ia64_ctx { spinlock_t lock; unsigned int next; /* next context number to use */ - unsigned int limit; /* next >= limit => must call wrap_mmu_context() */ - unsigned int max_ctx; /* max. context value supported by all CPUs */ + unsigned int limit; /* available free range */ + unsigned int max_ctx; /* max. 
context value supported by all CPUs */ + /* call wrap_mmu_context when next >= max */ + unsigned long *bitmap; /* bitmap size is max_ctx+1 */ + unsigned long *flushmap;/* pending rid to be flushed */ }; extern struct ia64_ctx ia64_ctx; DECLARE_PER_CPU(u8, ia64_need_tlb_flush); +extern void mmu_context_init (void); extern void wrap_mmu_context (struct mm_struct *mm); static inline void @@ -83,9 +87,16 @@ get_mmu_context (struct mm_struct *mm) context = mm->context; if (context == 0) { cpus_clear(mm->cpu_vm_mask); - if (ia64_ctx.next >= ia64_ctx.limit) - wrap_mmu_context(mm); + if (ia64_ctx.next >= ia64_ctx.limit) { + ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, + ia64_ctx.max_ctx, ia64_ctx.next); + ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap, + ia64_ctx.max_ctx, ia64_ctx.next); + if (ia64_ctx.next >= ia64_ctx.max_ctx) + wrap_mmu_context(mm); + } mm->context = context = ia64_ctx.next++; + __set_bit(context, ia64_ctx.bitmap); } } spin_unlock_irqrestore(&ia64_ctx.lock, flags); diff --git a/include/asm-ia64/tlbflush.h b/include/asm-ia64/tlbflush.h index b65c6270272..a35b323bae4 100644 --- a/include/asm-ia64/tlbflush.h +++ b/include/asm-ia64/tlbflush.h @@ -51,6 +51,7 @@ flush_tlb_mm (struct mm_struct *mm) if (!mm) return; + set_bit(mm->context, ia64_ctx.flushmap); mm->context = 0; if (atomic_read(&mm->mm_users) == 0) -- cgit v1.2.3 From 58cd90829918dabbd81a453de676d41fb7b628ad Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Sat, 29 Oct 2005 18:47:04 -0700 Subject: [IA64] make mmu_context.h and tlb.c 80-column friendly wrap_mmu_context(), delayed_tlb_flush(), get_mmu_context() all have an extra { } block which cause one extra indentation. get_mmu_context() is particularly bad with 5 indentations to the most inner "if". It finally gets on my nerve that I can't keep the code within 80 columns. Remove the extra { } block and while I'm at it, reformat all the comments to 80-column friendly. No functional change at all with this patch. 
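As an illustration (not part of either patch): the allocator introduced by
the previous patch and reformatted here is easy to model outside the kernel.
In the sketch below, find_next() is a naive stand-in for the kernel's
find_next_zero_bit()/find_next_bit(), MAX_CTX stands in for
ia64_ctx.max_ctx, and all locking and actual TLB flushing is omitted:

/* Illustrative model of the ia64_ctx bitmap allocator -- not kernel code. */
#include <limits.h>

#define MAX_CTX   ((1u << 18) - 1)	/* stand-in for ia64_ctx.max_ctx */
#define LONG_BITS (sizeof(unsigned long) * CHAR_BIT)
#define WORDS     ((MAX_CTX + 1) / LONG_BITS)

static unsigned long bitmap[WORDS];	/* context ids in use */
static unsigned long flushmap[WORDS];	/* freed ids, flush still pending */
static unsigned int next = 300, limit;	/* offset 300 skips daemons */

/* naive stand-in for find_next_zero_bit()/find_next_bit() */
static unsigned int find_next(int want_set, unsigned int size, unsigned int off)
{
	for (; off < size; off++)
		if (((bitmap[off / LONG_BITS] >> (off % LONG_BITS)) & 1) == want_set)
			break;
	return off;
}

static void wrap_ctx(void)
{
	unsigned int i;

	for (i = 0; i < WORDS; i++) {
		bitmap[i] ^= flushmap[i];	/* flushed ids become free again */
		flushmap[i] = 0;		/* the kernel uses xchg() here */
	}
	next = find_next(0, MAX_CTX, 300);
	limit = find_next(1, MAX_CTX, next);
	/* the kernel then forces a TLB flush on every online CPU */
}

static unsigned int get_ctx(void)
{
	if (next >= limit) {			/* free window exhausted */
		next = find_next(0, MAX_CTX, next);
		limit = find_next(1, MAX_CTX, next);
		if (next >= MAX_CTX)
			wrap_ctx();	/* assumes some id is free after wrap */
	}
	bitmap[next / LONG_BITS] |= 1UL << (next % LONG_BITS);
	return next++;
}

static void put_ctx(unsigned int ctx)	/* mirrors flush_tlb_mm() */
{
	flushmap[ctx / LONG_BITS] |= 1UL << (ctx % LONG_BITS);
}

Ids are handed out from the window [next, limit); freed ids are only parked
in the flushmap and become allocatable again at the next wrap, which is why
the whole RID space can be consumed before any global TLB flush is needed.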
Signed-off-by: Ken Chen Signed-off-by: Tony Luck --- arch/ia64/mm/tlb.c | 33 +++++++++-------- include/asm-ia64/mmu_context.h | 80 ++++++++++++++++++++++-------------------- 2 files changed, 59 insertions(+), 54 deletions(-) diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 39628fca274..41105d45442 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -29,7 +29,7 @@ static struct { unsigned long mask; /* mask of supported purge page-sizes */ - unsigned long max_bits; /* log2() of largest supported purge page-size */ + unsigned long max_bits; /* log2 of largest supported purge page-size */ } purge; struct ia64_ctx ia64_ctx = { @@ -58,7 +58,7 @@ mmu_context_init (void) void wrap_mmu_context (struct mm_struct *mm) { - int i; + int i, cpu; unsigned long flush_bit; for (i=0; i <= ia64_ctx.max_ctx / BITS_PER_LONG; i++) { @@ -72,20 +72,21 @@ wrap_mmu_context (struct mm_struct *mm) ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap, ia64_ctx.max_ctx, ia64_ctx.next); - /* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */ - { - int cpu = get_cpu(); /* prevent preemption/migration */ - for_each_online_cpu(i) { - if (i != cpu) - per_cpu(ia64_need_tlb_flush, i) = 1; - } - put_cpu(); - } + /* + * can't call flush_tlb_all() here because of race condition + * with O(1) scheduler [EF] + */ + cpu = get_cpu(); /* prevent preemption/migration */ + for_each_online_cpu(i) + if (i != cpu) + per_cpu(ia64_need_tlb_flush, i) = 1; + put_cpu(); local_flush_tlb_all(); } void -ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long nbits) +ia64_global_tlb_purge (struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long nbits) { static DEFINE_SPINLOCK(ptcg_lock); @@ -133,7 +134,8 @@ local_flush_tlb_all (void) } void -flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) +flush_tlb_range (struct vm_area_struct *vma, unsigned long start, + unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long size = end - start; @@ -147,7 +149,8 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long #endif nbits = ia64_fls(size + 0xfff); - while (unlikely (((1UL << nbits) & purge.mask) == 0) && (nbits < purge.max_bits)) + while (unlikely (((1UL << nbits) & purge.mask) == 0) && + (nbits < purge.max_bits)) ++nbits; if (nbits > purge.max_bits) nbits = purge.max_bits; @@ -189,5 +192,5 @@ ia64_tlb_init (void) local_cpu_data->ptce_stride[0] = ptce_info.stride[0]; local_cpu_data->ptce_stride[1] = ptce_info.stride[1]; - local_flush_tlb_all(); /* nuke left overs from bootstrapping... */ + local_flush_tlb_all(); /* nuke left overs from bootstrapping... */ } diff --git a/include/asm-ia64/mmu_context.h b/include/asm-ia64/mmu_context.h index 8d9b30b5f7d..b5c65081a3a 100644 --- a/include/asm-ia64/mmu_context.h +++ b/include/asm-ia64/mmu_context.h @@ -7,12 +7,13 @@ */ /* - * Routines to manage the allocation of task context numbers. Task context numbers are - * used to reduce or eliminate the need to perform TLB flushes due to context switches. - * Context numbers are implemented using ia-64 region ids. Since the IA-64 TLB does not - * consider the region number when performing a TLB lookup, we need to assign a unique - * region id to each region in a process. We use the least significant three bits in a - * region id for this purpose. + * Routines to manage the allocation of task context numbers. 
Task context + * numbers are used to reduce or eliminate the need to perform TLB flushes + * due to context switches. Context numbers are implemented using ia-64 + * region ids. Since the IA-64 TLB does not consider the region number when + * performing a TLB lookup, we need to assign a unique region id to each + * region in a process. We use the least significant three bits in aregion + * id for this purpose. */ #define IA64_REGION_ID_KERNEL 0 /* the kernel's region id (tlb.c depends on this being 0) */ @@ -51,10 +52,10 @@ enter_lazy_tlb (struct mm_struct *mm, struct task_struct *tsk) } /* - * When the context counter wraps around all TLBs need to be flushed because an old - * context number might have been reused. This is signalled by the ia64_need_tlb_flush - * per-CPU variable, which is checked in the routine below. Called by activate_mm(). - * + * When the context counter wraps around all TLBs need to be flushed because + * an old context number might have been reused. This is signalled by the + * ia64_need_tlb_flush per-CPU variable, which is checked in the routine + * below. Called by activate_mm(). */ static inline void delayed_tlb_flush (void) @@ -64,11 +65,9 @@ delayed_tlb_flush (void) if (unlikely(__ia64_per_cpu_var(ia64_need_tlb_flush))) { spin_lock_irqsave(&ia64_ctx.lock, flags); - { - if (__ia64_per_cpu_var(ia64_need_tlb_flush)) { - local_flush_tlb_all(); - __ia64_per_cpu_var(ia64_need_tlb_flush) = 0; - } + if (__ia64_per_cpu_var(ia64_need_tlb_flush)) { + local_flush_tlb_all(); + __ia64_per_cpu_var(ia64_need_tlb_flush) = 0; } spin_unlock_irqrestore(&ia64_ctx.lock, flags); } @@ -80,27 +79,27 @@ get_mmu_context (struct mm_struct *mm) unsigned long flags; nv_mm_context_t context = mm->context; - if (unlikely(!context)) { - spin_lock_irqsave(&ia64_ctx.lock, flags); - { - /* re-check, now that we've got the lock: */ - context = mm->context; - if (context == 0) { - cpus_clear(mm->cpu_vm_mask); - if (ia64_ctx.next >= ia64_ctx.limit) { - ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, - ia64_ctx.max_ctx, ia64_ctx.next); - ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap, - ia64_ctx.max_ctx, ia64_ctx.next); - if (ia64_ctx.next >= ia64_ctx.max_ctx) - wrap_mmu_context(mm); - } - mm->context = context = ia64_ctx.next++; - __set_bit(context, ia64_ctx.bitmap); - } + if (likely(context)) + goto out; + + spin_lock_irqsave(&ia64_ctx.lock, flags); + /* re-check, now that we've got the lock: */ + context = mm->context; + if (context == 0) { + cpus_clear(mm->cpu_vm_mask); + if (ia64_ctx.next >= ia64_ctx.limit) { + ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap, + ia64_ctx.max_ctx, ia64_ctx.next); + ia64_ctx.limit = find_next_bit(ia64_ctx.bitmap, + ia64_ctx.max_ctx, ia64_ctx.next); + if (ia64_ctx.next >= ia64_ctx.max_ctx) + wrap_mmu_context(mm); } - spin_unlock_irqrestore(&ia64_ctx.lock, flags); + mm->context = context = ia64_ctx.next++; + __set_bit(context, ia64_ctx.bitmap); } + spin_unlock_irqrestore(&ia64_ctx.lock, flags); +out: /* * Ensure we're not starting to use "context" before any old * uses of it are gone from our TLB. @@ -111,8 +110,8 @@ get_mmu_context (struct mm_struct *mm) } /* - * Initialize context number to some sane value. MM is guaranteed to be a brand-new - * address-space, so no TLB flushing is needed, ever. + * Initialize context number to some sane value. MM is guaranteed to be a + * brand-new address-space, so no TLB flushing is needed, ever. 
*/ static inline int init_new_context (struct task_struct *p, struct mm_struct *mm) @@ -173,7 +172,10 @@ activate_context (struct mm_struct *mm) if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) cpu_set(smp_processor_id(), mm->cpu_vm_mask); reload_context(context); - /* in the unlikely event of a TLB-flush by another thread, redo the load: */ + /* + * in the unlikely event of a TLB-flush by another thread, + * redo the load. + */ } while (unlikely(context != mm->context)); } @@ -186,8 +188,8 @@ static inline void activate_mm (struct mm_struct *prev, struct mm_struct *next) { /* - * We may get interrupts here, but that's OK because interrupt handlers cannot - * touch user-space. + * We may get interrupts here, but that's OK because interrupt + * handlers cannot touch user-space. */ ia64_set_kr(IA64_KR_PT_BASE, __pa(next->pgd)); activate_context(next); -- cgit v1.2.3 From 9138d581b0ef855c0314c41c14852a7231b9941c Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Mon, 7 Nov 2005 11:27:13 -0800 Subject: [IA64] Extend notify_die() hooks for IA64 notify_die() added for MCA_{MONARCH,SLAVE,RENDEZVOUS}_{ENTER,PROCESS,LEAVE} and INIT_{MONARCH,SLAVE}_{ENTER,PROCESS,LEAVE}. We need multiple notification points for these events because they can take many seconds to run which has nasty effects on the behaviour of the rest of the system. DIE_SS replaced by a generic DIE_FAULT which checks the vector number, to allow interception of faults other than SS. DIE_MACHINE_{HALT,RESTART} added to allow last minute close down processing, especially when the halt/restart routines are called from error handlers. DIE_OOPS added. The check for kprobe's break numbers has been moved from traps.c to kprobes.c, allowing DIE_BREAK to be used for any additional break numbers, i.e. it is no longer kprobes specific. Hooks for kernel debuggers and kernel dumpers added, ENTER and LEAVE. Both of these disable the system for long periods which impact on watchdogs and heartbeat systems in general. More patches to come that use these events to reset watchdogs and heartbeats. unregister_die_notifier() added and both routines exported. Requested by Dean Nelson. Lock removed from {un,}register_die_notifier. notifier_chain_register() already takes a lock. Also the generic notifier chain locking is being reworked to distinguish between callbacks that can block and those that cannot, the lock in {un,}register_die_notifier would interfere with that change. http://marc.theaimsgroup.com/?l=linux-kernel&m=113018709002036&w=2 Leading white space removed from arch/ia64/kernel/kprobes.c. Typo in mca.c in original version of this patch found & fixed by Dean Nelson. 
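As a usage illustration (not from this patch): with both routines exported,
a watchdog or heartbeat driver can hook the new events in the ordinary
notifier style. Everything named wdog_* below is hypothetical; only the
DIE_* values, die_args, and the (un)register calls come from the patch.
Note that the MCA/INIT hooks run in contexts where almost nothing is safe,
so real handlers must restrict themselves accordingly.

/* Hypothetical client of the extended die-notifier chain. */
#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/kdebug.h>

static int wdog_die_event(struct notifier_block *self, unsigned long val,
			  void *data)
{
	struct die_args *args = data;

	switch (val) {
	case DIE_MCA_MONARCH_ENTER:
	case DIE_INIT_MONARCH_ENTER:
		/* error handling may spin for many seconds: quiet the dog */
		break;
	case DIE_MCA_MONARCH_LEAVE:
	case DIE_INIT_MONARCH_LEAVE:
		/* ... and re-arm it here */
		break;
	case DIE_OOPS:
		printk(KERN_INFO "%s, err %ld\n", args->str, args->err);
		break;
	}
	/* NOTIFY_STOP from the MCA/INIT hooks makes that CPU spin in the OS */
	return NOTIFY_DONE;
}

static struct notifier_block wdog_die_nb = {
	.notifier_call = wdog_die_event,
};

static int __init wdog_init(void)
{
	return register_die_notifier(&wdog_die_nb);
}

static void __exit wdog_exit(void)
{
	unregister_die_notifier(&wdog_die_nb);	/* newly exported as well */
}

module_init(wdog_init);
module_exit(wdog_exit);
MODULE_LICENSE("GPL");	/* both routines are EXPORT_SYMBOL_GPL */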
Signed-off-by: Keith Owens Acked-by: Dean Nelson Acked-by: Anil Keshavamurthy Signed-off-by: Tony Luck --- arch/ia64/kernel/kprobes.c | 22 +++++---- arch/ia64/kernel/mca.c | 120 +++++++++++++++++++++++++++++++++++---------- arch/ia64/kernel/process.c | 6 +++ arch/ia64/kernel/traps.c | 44 ++++++++--------- include/asm-ia64/kdebug.h | 30 +++++++++++- 5 files changed, 162 insertions(+), 60 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 96736a119c9..801eeaeaf3d 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -347,7 +347,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) ((struct fnptr *)kretprobe_trampoline)->ip; spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(current); /* * It is possible to have multiple instances associated with a given @@ -363,9 +363,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * kretprobe_trampoline */ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) + if (ri->task != current) /* another task is sharing our hash bucket */ - continue; + continue; if (ri->rp && ri->rp->handler) ri->rp->handler(ri, regs); @@ -394,7 +394,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * kprobe_handler() that we don't want the post_handler * to run (and have re-enabled preemption) */ - return 1; + return 1; } /* Called with kretprobe_lock held */ @@ -739,12 +739,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, switch(val) { case DIE_BREAK: - if (pre_kprobes_handler(args)) - ret = NOTIFY_STOP; + /* err is break number from ia64_bad_break() */ + if (args->err == 0x80200 || args->err == 0x80300) + if (pre_kprobes_handler(args)) + ret = NOTIFY_STOP; break; - case DIE_SS: - if (post_kprobes_handler(args->regs)) - ret = NOTIFY_STOP; + case DIE_FAULT: + /* err is vector number from ia64_fault() */ + if (args->err == 36) + if (post_kprobes_handler(args->regs)) + ret = NOTIFY_STOP; break; case DIE_PAGE_FAULT: /* kprobe_running() needs smp_processor_id() */ diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 52c47da1724..355af15287c 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -51,6 +51,9 @@ * * 2005-08-12 Keith Owens * Convert MCA/INIT handlers to use per event stacks and SAL/OS state. + * + * 2005-10-07 Keith Owens + * Add notify_die() hooks. 
*/ #include #include @@ -58,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -69,6 +71,7 @@ #include #include +#include #include #include #include @@ -132,6 +135,14 @@ extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe); static int mca_init; + +static void inline +ia64_mca_spin(const char *func) +{ + printk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func); + while (1) + cpu_relax(); +} /* * IA64_MCA log support */ @@ -526,13 +537,16 @@ ia64_mca_wakeup_all(void) * Outputs : None */ static irqreturn_t -ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) +ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *regs) { unsigned long flags; int cpu = smp_processor_id(); /* Mask all interrupts */ local_irq_save(flags); + if (notify_die(DIE_MCA_RENDZVOUS_ENTER, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE; /* Register with the SAL monarch that the slave has @@ -540,10 +554,18 @@ ia64_mca_rendez_int_handler(int rendez_irq, void *arg, struct pt_regs *ptregs) */ ia64_sal_mc_rendez(); + if (notify_die(DIE_MCA_RENDZVOUS_PROCESS, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + /* Wait for the monarch cpu to exit. */ while (monarch_cpu != -1) cpu_relax(); /* spin until monarch leaves */ + if (notify_die(DIE_MCA_RENDZVOUS_LEAVE, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + /* Enable all interrupts */ local_irq_restore(flags); return IRQ_HANDLED; @@ -933,6 +955,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); monarch_cpu = cpu; + if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); ia64_wait_for_slaves(cpu); /* Wakeup all the processors which are spinning in the rendezvous loop. @@ -942,6 +967,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, * spinning in SAL does not work. 
*/ ia64_mca_wakeup_all(); + if (notify_die(DIE_MCA_MONARCH_PROCESS, "MCA", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); /* Get the MCA error record and log it */ ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); @@ -960,6 +988,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); sos->os_status = IA64_MCA_CORRECTED; } + if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, 0, 0, recover) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); set_curr_task(cpu, previous_current); monarch_cpu = -1; @@ -1188,6 +1219,37 @@ ia64_mca_cpe_poll (unsigned long dummy) #endif /* CONFIG_ACPI */ +static int +default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data) +{ + int c; + struct task_struct *g, *t; + if (val != DIE_INIT_MONARCH_PROCESS) + return NOTIFY_DONE; + printk(KERN_ERR "Processes interrupted by INIT -"); + for_each_online_cpu(c) { + struct ia64_sal_os_state *s; + t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); + s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); + g = s->prev_task; + if (g) { + if (g->pid) + printk(" %d", g->pid); + else + printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); + } + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { + do_each_thread (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); + } while_each_thread (g, t); + read_unlock(&tasklist_lock); + } + return NOTIFY_DONE; +} + /* * C portion of the OS INIT handler * @@ -1212,8 +1274,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, static atomic_t slaves; static atomic_t monarchs; task_t *previous_current; - int cpu = smp_processor_id(), c; - struct task_struct *g, *t; + int cpu = smp_processor_id(); oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ console_loglevel = 15; /* make sure printks make it to console */ @@ -1253,8 +1314,17 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT; while (monarch_cpu == -1) cpu_relax(); /* spin until monarch enters */ + if (notify_die(DIE_INIT_SLAVE_ENTER, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + if (notify_die(DIE_INIT_SLAVE_PROCESS, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); while (monarch_cpu != -1) cpu_relax(); /* spin until monarch leaves */ + if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); printk("Slave on cpu %d returning to normal service.\n", cpu); set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; @@ -1263,6 +1333,9 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, } monarch_cpu = cpu; + if (notify_die(DIE_INIT_MONARCH_ENTER, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); /* * Wait for a bit. 
On some machines (e.g., HP's zx2000 and zx6000, INIT can be @@ -1273,27 +1346,16 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, printk("Delaying for 5 seconds...\n"); udelay(5*1000000); ia64_wait_for_slaves(cpu); - printk(KERN_ERR "Processes interrupted by INIT -"); - for_each_online_cpu(c) { - struct ia64_sal_os_state *s; - t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); - s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); - g = s->prev_task; - if (g) { - if (g->pid) - printk(" %d", g->pid); - else - printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); - } - } - printk("\n\n"); - if (read_trylock(&tasklist_lock)) { - do_each_thread (g, t) { - printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); - show_stack(t, NULL); - } while_each_thread (g, t); - read_unlock(&tasklist_lock); - } + /* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through + * to default_monarch_init_process() above and just print all the + * tasks. + */ + if (notify_die(DIE_INIT_MONARCH_PROCESS, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); + if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, 0, 0, 0) + == NOTIFY_STOP) + ia64_mca_spin(__FUNCTION__); printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); atomic_dec(&monarchs); set_curr_task(cpu, previous_current); @@ -1462,6 +1524,10 @@ ia64_mca_init(void) s64 rc; struct ia64_sal_retval isrv; u64 timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + static struct notifier_block default_init_monarch_nb = { + .notifier_call = default_monarch_init_process, + .priority = 0/* we need to notified last */ + }; IA64_MCA_DEBUG("%s: begin\n", __FUNCTION__); @@ -1555,6 +1621,10 @@ ia64_mca_init(void) "(status %ld)\n", rc); return; } + if (register_die_notifier(&default_init_monarch_nb)) { + printk(KERN_ERR "Failed to register default monarch INIT process\n"); + return; + } IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __FUNCTION__); diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 051e050359e..c78355cb890 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -4,6 +4,9 @@ * Copyright (C) 1998-2003 Hewlett-Packard Co * David Mosberger-Tang * 04/11/17 Ashok Raj Added CPU Hotplug Support + * + * 2005-10-07 Keith Owens + * Add notify_die() hooks. 
*/ #define __KERNEL_SYSCALLS__ /* see */ #include @@ -34,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -804,12 +808,14 @@ cpu_halt (void) void machine_restart (char *restart_cmd) { + (void) notify_die(DIE_MACHINE_RESTART, restart_cmd, NULL, 0, 0, 0); (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); } void machine_halt (void) { + (void) notify_die(DIE_MACHINE_HALT, "", NULL, 0, 0, 0); cpu_halt(); } diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index f970359e7ed..fba5fdd1f96 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -30,17 +30,20 @@ fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); struct notifier_block *ia64die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); -int register_die_notifier(struct notifier_block *nb) +int +register_die_notifier(struct notifier_block *nb) { - int err = 0; - unsigned long flags; - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&ia64die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; + return notifier_chain_register(&ia64die_chain, nb); } +EXPORT_SYMBOL_GPL(register_die_notifier); + +int +unregister_die_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&ia64die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); void __init trap_init (void) @@ -105,6 +108,7 @@ die (const char *str, struct pt_regs *regs, long err) if (++die.lock_owner_depth < 3) { printk("%s[%d]: %s %ld [%d]\n", current->comm, current->pid, str, err, ++die_counter); + (void) notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); show_regs(regs); } else printk(KERN_ERR "Recursive die() failure, output suppressed\n"); @@ -155,9 +159,8 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) switch (break_num) { case 0: /* unknown error (used by GCC for __builtin_abort()) */ if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP) - == NOTIFY_STOP) { + == NOTIFY_STOP) return; - } die_if_kernel("bugcheck!", regs, break_num); sig = SIGILL; code = ILL_ILLOPC; break; @@ -210,15 +213,6 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) sig = SIGILL; code = __ILL_BNDMOD; break; - case 0x80200: - case 0x80300: - if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP) - == NOTIFY_STOP) { - return; - } - sig = SIGTRAP; code = TRAP_BRKPT; - break; - default: if (break_num < 0x40000 || break_num > 0x100000) die_if_kernel("Bad break", regs, break_num); @@ -226,6 +220,9 @@ __kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs) if (break_num < 0x80000) { sig = SIGILL; code = __ILL_BREAK; } else { + if (notify_die(DIE_BREAK, "bad break", regs, break_num, TRAP_BRKPT, SIGTRAP) + == NOTIFY_STOP) + return; sig = SIGTRAP; code = TRAP_BRKPT; } } @@ -578,12 +575,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa, #endif break; case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break; - case 36: - if (notify_die(DIE_SS, "ss", ®s, vector, - vector, SIGTRAP) == NOTIFY_STOP) - return; - siginfo.si_code = TRAP_TRACE; ifa = 0; break; + case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break; } + if (notify_die(DIE_FAULT, "ia64_fault", ®s, vector, siginfo.si_code, SIGTRAP) + == NOTIFY_STOP) + return; siginfo.si_signo = SIGTRAP; siginfo.si_errno = 0; siginfo.si_addr = (void __user *) ifa; diff --git a/include/asm-ia64/kdebug.h b/include/asm-ia64/kdebug.h index 4d376e1663f..8b01a083dde 100644 --- 
a/include/asm-ia64/kdebug.h +++ b/include/asm-ia64/kdebug.h @@ -22,6 +22,9 @@ * 2005-Apr Rusty Lynch and Anil S Keshavamurthy * adopted from * include/asm-x86_64/kdebug.h + * + * 2005-Oct Keith Owens . Expand notify_die to cover more + * events. */ #include @@ -35,13 +38,36 @@ struct die_args { int signr; }; -int register_die_notifier(struct notifier_block *nb); +extern int register_die_notifier(struct notifier_block *); +extern int unregister_die_notifier(struct notifier_block *); extern struct notifier_block *ia64die_chain; enum die_val { DIE_BREAK = 1, - DIE_SS, + DIE_FAULT, + DIE_OOPS, DIE_PAGE_FAULT, + DIE_MACHINE_HALT, + DIE_MACHINE_RESTART, + DIE_MCA_MONARCH_ENTER, + DIE_MCA_MONARCH_PROCESS, + DIE_MCA_MONARCH_LEAVE, + DIE_MCA_SLAVE_ENTER, + DIE_MCA_SLAVE_PROCESS, + DIE_MCA_SLAVE_LEAVE, + DIE_MCA_RENDZVOUS_ENTER, + DIE_MCA_RENDZVOUS_PROCESS, + DIE_MCA_RENDZVOUS_LEAVE, + DIE_INIT_MONARCH_ENTER, + DIE_INIT_MONARCH_PROCESS, + DIE_INIT_MONARCH_LEAVE, + DIE_INIT_SLAVE_ENTER, + DIE_INIT_SLAVE_PROCESS, + DIE_INIT_SLAVE_LEAVE, + DIE_KDEBUG_ENTER, + DIE_KDEBUG_LEAVE, + DIE_KDUMP_ENTER, + DIE_KDUMP_LEAVE, }; static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs, -- cgit v1.2.3 From 97835245768a638002722a36ba9a3b76d0910f68 Mon Sep 17 00:00:00 2001 From: Bob Picco Date: Sat, 29 Oct 2005 17:23:05 -0400 Subject: [IA64] fix memory less node allocation The original memory less node allocation attempted to use NODEDATA_ALIGN for alignment. The bootmem allocator only allows a power of two alignments. This causes a BUG_ON for some nodes. For cpu only nodes just allocate with a PERCPU_PAGE_SIZE alignment. Some older firmware reports SLIT distances of 0xff and results in bestnode not being computed. This is now treated correctly. The failed allocation check was removed because it's redundant. The bootmem allocator already makes this check. This fix has been boot tested on 4 node machine which has 4 cpu only nodes and 1 memory node. Thanks to Pete Keilty for reporting this and helping me test it. Signed-off-by: Bob Picco Signed-off-by: Tony Luck --- arch/ia64/mm/discontig.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index a88cdb7232f..0f776b032d3 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -350,14 +350,12 @@ static void __init initialize_pernode_data(void) * for best. 
* @nid: node id * @pernodesize: size of this node's pernode data - * @align: alignment to use for this node's pernode data */ -static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize, - unsigned long align) +static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) { void *ptr = NULL; u8 best = 0xff; - int bestnode = -1, node; + int bestnode = -1, node, anynode = 0; for_each_online_node(node) { if (node_isset(node, memory_less_mask)) @@ -366,13 +364,15 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize, best = node_distance(nid, node); bestnode = node; } + anynode = node; } - ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, - pernodesize, align, __pa(MAX_DMA_ADDRESS)); + if (bestnode == -1) + bestnode = anynode; + + ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat, pernodesize, + PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - if (!ptr) - panic("NO memory for memory less node\n"); return ptr; } @@ -413,8 +413,7 @@ static void __init memory_less_nodes(void) for_each_node_mask(node, memory_less_mask) { pernodesize = compute_pernodesize(node); - pernode = memory_less_node_alloc(node, pernodesize, - (node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024)); + pernode = memory_less_node_alloc(node, pernodesize); fill_pernode(node, __pa(pernode), pernodesize); } -- cgit v1.2.3 From cf20d1eafb648bf395b153cbcd0cde40f88c220a Mon Sep 17 00:00:00 2001 From: David Mosberger-Tang Date: Wed, 2 Nov 2005 22:40:19 -0800 Subject: [IA64] align signal-frame even when not using alternate signal-stack At the moment, attempting to invoke a signal-handler on the normal stack is guaranteed to fail if the stack-pointer happens not to be 16-byte aligned. This is because the signal-trampoline will attempt to store fp-regs with stf.spill instructions, which will trap for misaligned addresses. This isn't terribly useful behavior. It's better to just always align the signal frame to the next lower 16-byte boundary. 
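The arithmetic is worth spelling out: for a power-of-two STACK_ALIGN,
masking with -STACK_ALIGN (equivalently ~(STACK_ALIGN - 1)) rounds down to
the boundary, and doing it after subtracting the frame size keeps the frame
wholly below the old stack pointer. A small user-space check of this
(STACK_ALIGN and the frame size here are stand-ins, not the ia64 values):

#include <assert.h>
#include <stdio.h>

#define STACK_ALIGN 16UL	/* stand-in for the ia64 definition */

int main(void)
{
	unsigned long sp, frame;
	unsigned long frame_size = 840;	/* arbitrary stand-in size */

	for (sp = 0x600000; sp < 0x600020; sp++) {
		/* mirrors: frame = (new_sp - sizeof(*frame)) & -STACK_ALIGN */
		frame = (sp - frame_size) & ~(STACK_ALIGN - 1);
		assert(frame % STACK_ALIGN == 0);	/* stf.spill is safe */
		assert(frame + frame_size <= sp);	/* fits below old sp */
		printf("sp=%#lx -> frame=%#lx\n", sp, frame);
	}
	return 0;
}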
Signed-off-by: David Mosberger-Tang Signed-off-by: Tony Luck --- arch/ia64/kernel/signal.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 774f34b675c..58ce07efc56 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -387,15 +387,14 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct sigscratch *scr) { extern char __kernel_sigtramp[]; - unsigned long tramp_addr, new_rbs = 0; + unsigned long tramp_addr, new_rbs = 0, new_sp; struct sigframe __user *frame; long err; - frame = (void __user *) scr->pt.r12; + new_sp = scr->pt.r12; tramp_addr = (unsigned long) __kernel_sigtramp; - if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags((unsigned long) frame) == 0) { - frame = (void __user *) ((current->sas_ss_sp + current->sas_ss_size) - & ~(STACK_ALIGN - 1)); + if ((ka->sa.sa_flags & SA_ONSTACK) && sas_ss_flags(new_sp) == 0) { + new_sp = current->sas_ss_sp + current->sas_ss_size; /* * We need to check for the register stack being on the signal stack * separately, because it's switched separately (memory stack is switched @@ -404,7 +403,7 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, if (!rbs_on_sig_stack(scr->pt.ar_bspstore)) new_rbs = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1); } - frame = (void __user *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)); + frame = (void __user *) ((new_sp - sizeof(*frame)) & -STACK_ALIGN); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return force_sigsegv_info(sig, frame); -- cgit v1.2.3 From a14f25a076a8e5040d6f4e93f84034c81bcddbf7 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 4 Nov 2005 13:39:38 -0600 Subject: [IA64] MCA recovery based on PSP bits The determination of whether an MCA is recoverable or not must be based on the bits set in the PSP (Processor State Parameter). The specific bits are shown in the Intel IA-64 Architecture Software Developer's Manual, Vol 2, Table 11-6 Software Recovery Bits in Processor State Parameter. Those bits should be consistent across the entire IA-64 family of processors. Signed-off-by: Russ Anderson Signed-off-by: Tony Luck --- arch/ia64/kernel/mca_drv.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index f081c60ab20..eb860e29341 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -546,9 +546,20 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, (pal_processor_state_info_t*)peidx_psp(peidx); /* - * We cannot recover errors with other than bus_check. + * Processor recovery status must key off of the PAL recovery + * status in the Processor State Parameter. */ - if (psp->cc || psp->rc || psp->uc) + + /* + * The machine check is corrected. + */ + if (psp->cm == 1) + return 1; + + /* + * The error was not contained. Software must be reset. + */ + if (psp->us || psp->ci == 0) return 0; /* @@ -569,8 +580,6 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, return 0; if (pbci->eb && pbci->bsi > 0) return 0; - if (psp->ci == 0) - return 0; /* * This is a local MCA and estimated as recoverble external bus error. 
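Not part of the patch above: the new policy is compact enough to restate as
a predicate. This is an illustrative condensation of the rewritten
recover_from_processor_error(), assuming the pal_processor_state_info_t
bitfields from asm/pal.h; the -1 convention ("go on to the bus_check
analysis") is invented here for the sketch.

static int psp_verdict(const pal_processor_state_info_t *psp)
{
	if (psp->cm)			/* machine check already corrected */
		return 1;
	if (psp->us || psp->ci == 0)	/* uncontained, or not isolated */
		return 0;		/* software must be reset */
	return -1;			/* examine the bus check next */
}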
-- cgit v1.2.3 From 4f41d5a4e665d05b4e74eef164469b7d81932ef1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 7 Nov 2005 15:13:59 -0700 Subject: [IA64] add the MMIO regions that are translated to I/O port space to /proc/iomem ia64 translates normal loads and stores to special MMIO regions into I/O port accesses. Reserve these special MMIO regions in /proc/iomem. Sample /proc/iomem: f8100000000-f81003fffff : PCI Bus 0000:80 I/O Ports 00000000-00000fff f8100400000-f81007fffff : PCI Bus 0000:8e I/O Ports 00001000-00001fff f8100800000-f8100ffffff : PCI Bus 0000:9c I/O Ports 00002000-00003fff f8101000000-f81017fffff : PCI Bus 0000:aa I/O Ports 00004000-00005fff and corresponding /proc/ioports: 00000000-00000fff : PCI Bus 0000:80 00001000-00001fff : PCI Bus 0000:8e 00002000-00003fff : PCI Bus 0000:9c 00004000-00005fff : PCI Bus 0000:aa Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck --- arch/ia64/pci/pci.c | 106 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 24 deletions(-) diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 017cfc3f478..20d76fae24e 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -95,7 +95,7 @@ pci_sal_write (unsigned int seg, unsigned int bus, unsigned int devfn, } static struct pci_raw_ops pci_sal_ops = { - .read = pci_sal_read, + .read = pci_sal_read, .write = pci_sal_write }; @@ -137,35 +137,98 @@ alloc_pci_controller (int seg) return controller; } -static u64 __devinit -add_io_space (struct acpi_resource_address64 *addr) +struct pci_root_info { + struct pci_controller *controller; + char *name; +}; + +static unsigned int +new_space (u64 phys_base, int sparse) { - u64 offset; - int sparse = 0; + u64 mmio_base; int i; - if (addr->address_translation_offset == 0) - return IO_SPACE_BASE(0); /* part of legacy IO space */ - - if (addr->attribute.io.translation_attribute == ACPI_SPARSE_TRANSLATION) - sparse = 1; + if (phys_base == 0) + return 0; /* legacy I/O port space */ - offset = (u64) ioremap(addr->address_translation_offset, 0); + mmio_base = (u64) ioremap(phys_base, 0); for (i = 0; i < num_io_spaces; i++) - if (io_space[i].mmio_base == offset && + if (io_space[i].mmio_base == mmio_base && io_space[i].sparse == sparse) - return IO_SPACE_BASE(i); + return i; if (num_io_spaces == MAX_IO_SPACES) { - printk("Too many IO port spaces\n"); + printk(KERN_ERR "PCI: Too many IO port spaces " + "(MAX_IO_SPACES=%lu)\n", MAX_IO_SPACES); return ~0; } i = num_io_spaces++; - io_space[i].mmio_base = offset; + io_space[i].mmio_base = mmio_base; io_space[i].sparse = sparse; - return IO_SPACE_BASE(i); + return i; +} + +static u64 __devinit +add_io_space (struct pci_root_info *info, struct acpi_resource_address64 *addr) +{ + struct resource *resource; + char *name; + u64 base, min, max, base_port; + unsigned int sparse = 0, space_nr, len; + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) { + printk(KERN_ERR "PCI: No memory for %s I/O port space\n", + info->name); + goto out; + } + + len = strlen(info->name) + 32; + name = kzalloc(len, GFP_KERNEL); + if (!name) { + printk(KERN_ERR "PCI: No memory for %s I/O port space name\n", + info->name); + goto free_resource; + } + + min = addr->min_address_range; + max = min + addr->address_length - 1; + if (addr->attribute.io.translation_attribute == ACPI_SPARSE_TRANSLATION) + sparse = 1; + + space_nr = new_space(addr->address_translation_offset, sparse); + if (space_nr == ~0) + goto free_name; + + base = __pa(io_space[space_nr].mmio_base); + base_port = 
IO_SPACE_BASE(space_nr); + snprintf(name, len, "%s I/O Ports %08lx-%08lx", info->name, + base_port + min, base_port + max); + + /* + * The SDM guarantees the legacy 0-64K space is sparse, but if the + * mapping is done by the processor (not the bridge), ACPI may not + * mark it as sparse. + */ + if (space_nr == 0) + sparse = 1; + + resource->name = name; + resource->flags = IORESOURCE_MEM; + resource->start = base + (sparse ? IO_SPACE_SPARSE_ENCODING(min) : min); + resource->end = base + (sparse ? IO_SPACE_SPARSE_ENCODING(max) : max); + insert_resource(&iomem_resource, resource); + + return base_port; + +free_name: + kfree(name); +free_resource: + kfree(resource); +out: + return ~0; } static acpi_status __devinit resource_to_window(struct acpi_resource *resource, @@ -205,11 +268,6 @@ count_window (struct acpi_resource *resource, void *data) return AE_OK; } -struct pci_root_info { - struct pci_controller *controller; - char *name; -}; - static __devinit acpi_status add_window(struct acpi_resource *res, void *data) { struct pci_root_info *info = data; @@ -231,7 +289,7 @@ static __devinit acpi_status add_window(struct acpi_resource *res, void *data) } else if (addr.resource_type == ACPI_IO_RANGE) { flags = IORESOURCE_IO; root = &ioport_resource; - offset = add_io_space(&addr); + offset = add_io_space(info, &addr); if (offset == ~0) return AE_OK; } else @@ -241,7 +299,7 @@ static __devinit acpi_status add_window(struct acpi_resource *res, void *data) window->resource.name = info->name; window->resource.flags = flags; window->resource.start = addr.min_address_range + offset; - window->resource.end = addr.max_address_range + offset; + window->resource.end = window->resource.start + addr.address_length - 1; window->resource.child = NULL; window->offset = offset; @@ -739,7 +797,7 @@ int pci_vector_resources(int last, int nr_released) { int count = nr_released; - count += (IA64_LAST_DEVICE_VECTOR - last); + count += (IA64_LAST_DEVICE_VECTOR - last); return count; } -- cgit v1.2.3 From baf47fb66020e5c3fe2386680fa2d79d1f8e0052 Mon Sep 17 00:00:00 2001 From: Panagiotis Issaris Date: Wed, 9 Nov 2005 02:08:42 +0100 Subject: [IA64] Replace kcalloc(1, with kzalloc. Conversion from kcalloc(1, to kzalloc. 
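The rewrite is purely mechanical: kcalloc(n, size, flags) returns zeroed
memory for an n-element array, so every n == 1 call is exactly
kzalloc(size, flags) minus a needless multiply. Schematically:

	/* before: a zeroed one-element "array" */
	controller = kcalloc(1, sizeof(struct pci_controller), GFP_KERNEL);

	/* after: the single-object form, identical zeroed result */
	controller = kzalloc(sizeof(struct pci_controller), GFP_KERNEL);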
Signed-off-by: Panagiotis Issaris Signed-off-by: Tony Luck --- arch/ia64/kernel/efi.c | 2 +- arch/ia64/sn/kernel/io_init.c | 2 +- arch/ia64/sn/pci/tioce_provider.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index f72ea6aebcb..a3aa45cbcfa 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -987,7 +987,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, break; } - if ((res = kcalloc(1, sizeof(struct resource), GFP_KERNEL)) == NULL) { + if ((res = kzalloc(sizeof(struct resource), GFP_KERNEL)) == NULL) { printk(KERN_ERR "failed to alocate resource for iomem\n"); return; } diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c index b4f5053f5e1..05e4ea88998 100644 --- a/arch/ia64/sn/kernel/io_init.c +++ b/arch/ia64/sn/kernel/io_init.c @@ -349,7 +349,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus) return; /*bus # does not exist */ prom_bussoft_ptr = __va(prom_bussoft_ptr); - controller = kcalloc(1,sizeof(struct pci_controller), GFP_KERNEL); + controller = kzalloc(sizeof(struct pci_controller), GFP_KERNEL); controller->segment = segment; if (!controller) BUG(); diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index 9f03d4e5121..dda196c9e32 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -218,7 +218,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, if (i > last) return 0; - map = kcalloc(1, sizeof(struct tioce_dmamap), GFP_ATOMIC); + map = kzalloc(sizeof(struct tioce_dmamap), GFP_ATOMIC); if (!map) return 0; @@ -555,7 +555,7 @@ tioce_kern_init(struct tioce_common *tioce_common) struct tioce *tioce_mmr; struct tioce_kernel *tioce_kern; - tioce_kern = kcalloc(1, sizeof(struct tioce_kernel), GFP_KERNEL); + tioce_kern = kzalloc(sizeof(struct tioce_kernel), GFP_KERNEL); if (!tioce_kern) { return NULL; } @@ -727,7 +727,7 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont * Allocate kernel bus soft and copy from prom. */ - tioce_common = kcalloc(1, sizeof(struct tioce_common), GFP_KERNEL); + tioce_common = kzalloc(sizeof(struct tioce_common), GFP_KERNEL); if (!tioce_common) return NULL; -- cgit v1.2.3 From 780d09e895032207a6b070a44d392a3c60574b70 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Wed, 9 Nov 2005 14:41:57 -0600 Subject: [IA64] utilize notify_die() for XPC disengage XPC (as in arch/ia64/sn/kernel/xp*) has a need to notify other partitions (SGI Altix) whenever a partition is going down in order to get them to disengage from accessing the halting partition's memory. If this is not done before the reset of the hardware, the other partitions can find themselves encountering MCAs that bring them down. 
Signed-off-by: Dean Nelson Signed-off-by: Tony Luck --- arch/ia64/sn/kernel/xpc.h | 2 +- arch/ia64/sn/kernel/xpc_main.c | 102 ++++++++++++++++++++++++++++++++++++ arch/ia64/sn/kernel/xpc_partition.c | 8 +-- 3 files changed, 107 insertions(+), 5 deletions(-) diff --git a/arch/ia64/sn/kernel/xpc.h b/arch/ia64/sn/kernel/xpc.h index fbcedc7c27f..5483a9f227d 100644 --- a/arch/ia64/sn/kernel/xpc.h +++ b/arch/ia64/sn/kernel/xpc.h @@ -163,7 +163,7 @@ struct xpc_vars { u8 version; u64 heartbeat; u64 heartbeating_to_mask; - u64 kdb_status; /* 0 = machine running */ + u64 heartbeat_offline; /* if 0, heartbeat should be changing */ int act_nasid; int act_phys_cpuid; u64 vars_part_pa; diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c index cece3c7c69b..b617236524c 100644 --- a/arch/ia64/sn/kernel/xpc_main.c +++ b/arch/ia64/sn/kernel/xpc_main.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include "xpc.h" @@ -188,6 +189,11 @@ static struct notifier_block xpc_reboot_notifier = { .notifier_call = xpc_system_reboot, }; +static int xpc_system_die(struct notifier_block *, unsigned long, void *); +static struct notifier_block xpc_die_notifier = { + .notifier_call = xpc_system_die, +}; + /* * Timer function to enforce the timelimit on the partition disengage request. @@ -997,6 +1003,9 @@ xpc_do_exit(enum xpc_retval reason) /* take ourselves off of the reboot_notifier_list */ (void) unregister_reboot_notifier(&xpc_reboot_notifier); + /* take ourselves off of the die_notifier list */ + (void) unregister_die_notifier(&xpc_die_notifier); + /* close down protections for IPI operations */ xpc_restrict_IPI_ops(); @@ -1010,6 +1019,63 @@ xpc_do_exit(enum xpc_retval reason) } +/* + * Called when the system is about to be either restarted or halted. + */ +static void +xpc_die_disengage(void) +{ + struct xpc_partition *part; + partid_t partid; + unsigned long engaged; + long time, print_time, disengage_request_timeout; + + + /* keep xpc_hb_checker thread from doing anything (just in case) */ + xpc_exiting = 1; + + xpc_vars->heartbeating_to_mask = 0; /* indicate we're deactivated */ + + for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) { + part = &xpc_partitions[partid]; + + if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part-> + remote_vars_version)) { + + /* just in case it was left set by an earlier XPC */ + xpc_clear_partition_engaged(1UL << partid); + continue; + } + + if (xpc_partition_engaged(1UL << partid) || + part->act_state != XPC_P_INACTIVE) { + xpc_request_partition_disengage(part); + xpc_mark_partition_disengaged(part); + xpc_IPI_send_disengage(part); + } + } + + print_time = rtc_time(); + disengage_request_timeout = print_time + + (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second); + + /* wait for all other partitions to disengage from us */ + + while ((engaged = xpc_partition_engaged(-1UL)) && + (time = rtc_time()) < disengage_request_timeout) { + + if (time >= print_time) { + dev_info(xpc_part, "waiting for remote partitions to " + "disengage, engaged=0x%lx\n", engaged); + print_time = time + (XPC_DISENGAGE_PRINTMSG_INTERVAL * + sn_rtc_cycles_per_second); + } + } + dev_info(xpc_part, "finished waiting for remote partitions to " + "disengage, engaged=0x%lx\n", engaged); +} + + /* * This function is called when the system is being rebooted. */ @@ -1038,6 +1104,33 @@ xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused) } +/* + * This function is called when the system is being rebooted. 
+ */ +static int +xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused) +{ + switch (event) { + case DIE_MACHINE_RESTART: + case DIE_MACHINE_HALT: + xpc_die_disengage(); + break; + case DIE_MCA_MONARCH_ENTER: + case DIE_INIT_MONARCH_ENTER: + xpc_vars->heartbeat++; + xpc_vars->heartbeat_offline = 1; + break; + case DIE_MCA_MONARCH_LEAVE: + case DIE_INIT_MONARCH_LEAVE: + xpc_vars->heartbeat++; + xpc_vars->heartbeat_offline = 0; + break; + } + + return NOTIFY_DONE; +} + + int __init xpc_init(void) { @@ -1154,6 +1247,12 @@ xpc_init(void) dev_warn(xpc_part, "can't register reboot notifier\n"); } + /* add ourselves to the die_notifier list (i.e., ia64die_chain) */ + ret = register_die_notifier(&xpc_die_notifier); + if (ret != 0) { + dev_warn(xpc_part, "can't register die notifier\n"); + } + /* * Set the beating to other partitions into motion. This is @@ -1179,6 +1278,9 @@ xpc_init(void) /* take ourselves off of the reboot_notifier_list */ (void) unregister_reboot_notifier(&xpc_reboot_notifier); + /* take ourselves off of the die_notifier list */ + (void) unregister_die_notifier(&xpc_die_notifier); + del_timer_sync(&xpc_hb_timer); free_irq(SGI_XPC_ACTIVATE, NULL); xpc_restrict_IPI_ops(); diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c index 581e113d2d3..cdd6431853a 100644 --- a/arch/ia64/sn/kernel/xpc_partition.c +++ b/arch/ia64/sn/kernel/xpc_partition.c @@ -436,13 +436,13 @@ xpc_check_remote_hb(void) } dev_dbg(xpc_part, "partid = %d, heartbeat = %ld, last_heartbeat" - " = %ld, kdb_status = %ld, HB_mask = 0x%lx\n", partid, - remote_vars->heartbeat, part->last_heartbeat, - remote_vars->kdb_status, + " = %ld, heartbeat_offline = %ld, HB_mask = 0x%lx\n", + partid, remote_vars->heartbeat, part->last_heartbeat, + remote_vars->heartbeat_offline, remote_vars->heartbeating_to_mask); if (((remote_vars->heartbeat == part->last_heartbeat) && - (remote_vars->kdb_status == 0)) || + (remote_vars->heartbeat_offline == 0)) || !xpc_hb_allowed(sn_partition_id, remote_vars)) { XPC_DEACTIVATE_PARTITION(part, xpcNoHeartbeat); -- cgit v1.2.3
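A closing illustration of the protocol this last patch adjusts: renaming
kdb_status to heartbeat_offline makes the liveness rule explicit. A remote
partition is deactivated only when its heartbeat has stalled while it has
not declared itself offline around an MCA/INIT/kdb window. The helper below
is hypothetical (field names are from the diff; the real test in
xpc_check_remote_hb() also consults xpc_hb_allowed()):

static int remote_heartbeat_stalled(const struct xpc_vars *remote,
				    const struct xpc_partition *part)
{
	/* stalled AND not legitimately offline => deactivate the partition */
	return remote->heartbeat == part->last_heartbeat &&
	       remote->heartbeat_offline == 0;
}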