From ac6b931c44fd9988eaa821c339d54ba06b212412 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:19 -0700 Subject: [PATCH] x86_64: Reduce NMI watchdog stack usage NR_CPUs can be quite big these days. kmalloc the per CPU array instead of putting it onto the stack Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/nmi.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 61de0b34a01..ec13eb97e8e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -114,7 +114,7 @@ static __init int cpu_has_lapic(void) static int __init check_nmi_watchdog (void) { - int counts[NR_CPUS]; + int *counts; int cpu; if (nmi_watchdog == NMI_NONE) @@ -125,6 +125,12 @@ static int __init check_nmi_watchdog (void) return -1; } + counts = kmalloc(NR_CPUS * sizeof(int),GFP_KERNEL); + if (!counts) { + nmi_watchdog = NMI_NONE; + return 0; + } + printk(KERN_INFO "Testing NMI watchdog ... "); for (cpu = 0; cpu < NR_CPUS; cpu++) @@ -139,6 +145,7 @@ static int __init check_nmi_watchdog (void) cpu_pda[cpu].__nmi_count); nmi_active = 0; lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; + kfree(counts); return -1; } } @@ -149,6 +156,7 @@ static int __init check_nmi_watchdog (void) if (nmi_watchdog == NMI_LOCAL_APIC) nmi_hz = 1; + kfree(counts); return 0; } /* Have this called later during boot so counters are updating */ -- cgit v1.2.3 From 10ffdbb8d605be88b148f127ec86452f1364d4f0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:19 -0700 Subject: [PATCH] x86_64: Readd missing tests in entry.S Cleans up the system exit call slightly and synchronizes with my tree again. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/entry.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 1086b5fcac2..28817490fdc 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -220,13 +220,18 @@ sysret_careful: jmp sysret_check /* Handle a signal */ - /* edx: work flags (arg3) */ sysret_signal: sti + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz 1f + + /* Really a signal */ + /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common +1: movl $_TIF_NEED_RESCHED,%edi jmp sysret_check /* Do syscall tracing */ @@ -484,6 +489,8 @@ retint_careful: jmp retint_check retint_signal: + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz retint_swapgs sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -492,8 +499,8 @@ retint_signal: call do_notify_resume RESTORE_REST cli + movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) - movl $_TIF_WORK_MASK,%edi jmp retint_check #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From a158608bf4c6260caf26089b00a000851e11357a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:21 -0700 Subject: [PATCH] x86_64/i386: fix defaults for physical/core id in /proc/cpuinfo Last round hopefully of cpu_core_id changes hopefully fow now: - Always initialize cpu_core_id for all CPUs, even when no dual core setup is detected. This prevents funny /proc/cpuinfo output - Do the same with phys_proc_id[] even when no HyperThreading - dito. 
- Use the CPU APIC-ID from CPUID 1 instead of the linux virtual CPU number to identify the core for AMD dual core setups. Patch for i386/x86-64. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 2129cf9ba6b..aab249a5746 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -731,7 +731,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int node = 0; if (c->x86_num_cores == 1) return; - cpu_core_id[cpu] = cpu >> hweight32(c->x86_num_cores - 1); + /* Fix up the APIC ID following the AMD specification. */ + cpu_core_id[cpu] >>= hweight32(c->x86_num_cores - 1); #ifdef CONFIG_NUMA /* When an ACPI SRAT table is available use the mappings from SRAT @@ -745,6 +746,9 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) node = cpu_to_node[cpu]; } #endif + /* For now: - better than BAD_APIC_ID at least*/ + phys_proc_id[cpu] = cpu_core_id[cpu]; + printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", cpu, c->x86_num_cores, node, cpu_core_id[cpu]); #endif @@ -959,6 +963,11 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) /* Have CPUID level 0 only - unheard of */ c->x86 = 4; } + +#ifdef CONFIG_SMP + phys_proc_id[smp_processor_id()] = + cpu_core_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; +#endif } /* -- cgit v1.2.3 From dda50e716dc9451f40eebfb2902c260e4f62cf34 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:25 -0700 Subject: [PATCH] x86_64: Update TSC sync algorithm The new TSC sync algorithm recently submitted did not work too well. The result was that some MP machines where the TSC came up of the BIOS very unsynchronized and that did not have HPET support were nearly unusable because the time would jump forwards and backwards between CPUs. After a lot of research ;-) and some more prototypes I ended up with just using the one from IA64 which looks best. It has some internal self tuning that should adapt to changing interconnect latencies. It holds up in my tests so far. I believe it was originally written by David Mosberger, I just ported it over to x86-64. See the inline comment for a description. This cleans up the code because it uses smp_call_function for syncing instead of having custom hooks in SMP bootup. Please note that the cycle numbers it outputs are too optimistic because they do not take into account the latency of WRMSR and RDTSC, which can be hundreds of cycles. It seems to be able to sync a dual Opteron to 200-300 cycles, which is probably good enough. There is a timing window during AP bootup where interrupts can see inconsistent time before the TSC is synced. It is hard to avoid unfortunately because we can only do the TSC sync after some setup, and we need to enable interrupts before that. I just ignored it for now. 
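As a rough standalone illustration of the measurement described above -- not the kernel code from the patch below -- the slave takes a timestamp t0, the master replies with tm, the slave takes t1, and the adjustment is derived from how far tm sits from the midpoint of t0 and t1 (the sample values here are made up):

/* Illustrative sketch of the midpoint arithmetic used by the IA64-style
 * TSC sync loop described above.  The real code in the patch reads the
 * TSC and exchanges timestamps through shared memory instead. */
#include <stdio.h>

int main(void)
{
	unsigned long t0 = 1000;   /* slave TSC before asking the master */
	unsigned long t1 = 1600;   /* slave TSC after the master answered */
	unsigned long tm = 1290;   /* master TSC taken in between */

	/* average t0 and t1 without overflow, as get_delta() does */
	unsigned long tcenter = t0 / 2 + t1 / 2;
	if ((t0 % 2) + (t1 % 2) == 2)
		++tcenter;

	long delta = (long)(tcenter - tm); /* >0: slave ahead, <0: behind */

	printf("roundtrip=%lu delta=%ld\n", t1 - t0, delta);
	/* The slave would subtract delta from its TSC (via WRMSR) and
	 * repeat; the residual error is bounded by the roundtrip t1 - t0. */
	return 0;
}

Repeating this measurement and keeping the iteration with the smallest roundtrip is what bounds the remaining error by the interconnect latency, as the original comment below explains.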
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 257 ++++++++++++++++++++++++++++++------------- 1 file changed, 182 insertions(+), 75 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 73f7e8b9543..17efc40baa3 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -125,96 +125,210 @@ static void __cpuinit smp_store_cpu_info(int id) *c = boot_cpu_data; identify_cpu(c); + print_cpu_info(c); } /* - * Synchronize TSCs of CPUs + * New Funky TSC sync algorithm borrowed from IA64. + * Main advantage is that it doesn't reset the TSCs fully and + * in general looks more robust and it works better than my earlier + * attempts. I believe it was written by David Mosberger. Some minor + * adjustments for x86-64 by me -AK * - * This new algorithm is less accurate than the old "zero TSCs" - * one, but we cannot zero TSCs anymore in the new hotplug CPU - * model. + * Original comment reproduced below. + * + * Synchronize TSC of the current (slave) CPU with the TSC of the + * MASTER CPU (normally the time-keeper CPU). We use a closed loop to + * eliminate the possibility of unaccounted-for errors (such as + * getting a machine check in the middle of a calibration step). The + * basic idea is for the slave to ask the master what itc value it has + * and to read its own itc before and after the master responds. Each + * iteration gives us three timestamps: + * + * slave master + * + * t0 ---\ + * ---\ + * ---> + * tm + * /--- + * /--- + * t1 <--- + * + * + * The goal is to adjust the slave's TSC such that tm falls exactly + * half-way between t0 and t1. If we achieve this, the clocks are + * synchronized provided the interconnect between the slave and the + * master is symmetric. Even if the interconnect were asymmetric, we + * would still know that the synchronization error is smaller than the + * roundtrip latency (t0 - t1). + * + * When the interconnect is quiet and symmetric, this lets us + * synchronize the TSC to within one or two cycles. However, we can + * only *guarantee* that the synchronization is accurate to within a + * round-trip time, which is typically in the range of several hundred + * cycles (e.g., ~500 cycles). In practice, this means that the TSCs + * are usually almost perfectly synchronized, but we shouldn't assume + * that the accuracy is much better than half a micro second or so. + * + * [there are other errors like the latency of RDTSC and of the + * WRMSR. These can also account to hundreds of cycles. So it's + * probably worse. It claims 153 cycles error on a dual Opteron, + * but I suspect the numbers are actually somewhat worse -AK] */ -static atomic_t __cpuinitdata tsc_flag; +#define MASTER 0 +#define SLAVE (SMP_CACHE_BYTES/8) + +/* Intentionally don't use cpu_relax() while TSC synchronization + because we don't want to go into funky power save modi or cause + hypervisors to schedule us away. Going to sleep would likely affect + latency and low latency is the primary objective here. 
-AK */ +#define no_cpu_relax() barrier() + static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); -static unsigned long long __cpuinitdata bp_tsc, ap_tsc; +static volatile __cpuinitdata unsigned long go[SLAVE + 1]; +static int notscsync __cpuinitdata; + +#undef DEBUG_TSC_SYNC -#define NR_LOOPS 5 +#define NUM_ROUNDS 64 /* magic value */ +#define NUM_ITERS 5 /* likewise */ -static void __cpuinit sync_tsc_bp_init(int init) +/* Callback on boot CPU */ +static __cpuinit void sync_master(void *arg) { - if (init) - _raw_spin_lock(&tsc_sync_lock); - else - _raw_spin_unlock(&tsc_sync_lock); - atomic_set(&tsc_flag, 0); + unsigned long flags, i; + + if (smp_processor_id() != boot_cpu_id) + return; + + go[MASTER] = 0; + + local_irq_save(flags); + { + for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) { + while (!go[MASTER]) + no_cpu_relax(); + go[MASTER] = 0; + rdtscll(go[SLAVE]); + } + } + local_irq_restore(flags); } /* - * Synchronize TSC on AP with BP. + * Return the number of cycles by which our tsc differs from the tsc + * on the master (time-keeper) CPU. A positive number indicates our + * tsc is ahead of the master, negative that it is behind. */ -static void __cpuinit __sync_tsc_ap(void) +static inline long +get_delta(long *rt, long *master) { - if (!cpu_has_tsc) - return; - Dprintk("AP %d syncing TSC\n", smp_processor_id()); + unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0; + unsigned long tcenter, t0, t1, tm; + int i; - while (atomic_read(&tsc_flag) != 0) - cpu_relax(); - atomic_inc(&tsc_flag); - mb(); - _raw_spin_lock(&tsc_sync_lock); - wrmsrl(MSR_IA32_TSC, bp_tsc); - _raw_spin_unlock(&tsc_sync_lock); - rdtscll(ap_tsc); - mb(); - atomic_inc(&tsc_flag); - mb(); + for (i = 0; i < NUM_ITERS; ++i) { + rdtscll(t0); + go[MASTER] = 1; + while (!(tm = go[SLAVE])) + no_cpu_relax(); + go[SLAVE] = 0; + rdtscll(t1); + + if (t1 - t0 < best_t1 - best_t0) + best_t0 = t0, best_t1 = t1, best_tm = tm; + } + + *rt = best_t1 - best_t0; + *master = best_tm - best_t0; + + /* average best_t0 and best_t1 without overflow: */ + tcenter = (best_t0/2 + best_t1/2); + if (best_t0 % 2 + best_t1 % 2 == 2) + ++tcenter; + return tcenter - best_tm; } -static void __cpuinit sync_tsc_ap(void) +static __cpuinit void sync_tsc(void) { - int i; - for (i = 0; i < NR_LOOPS; i++) - __sync_tsc_ap(); + int i, done = 0; + long delta, adj, adjust_latency = 0; + unsigned long flags, rt, master_time_stamp, bound; +#if DEBUG_TSC_SYNC + static struct syncdebug { + long rt; /* roundtrip time */ + long master; /* master's timestamp */ + long diff; /* difference between midpoint and master's timestamp */ + long lat; /* estimate of tsc adjustment latency */ + } t[NUM_ROUNDS] __cpuinitdata; +#endif + + go[MASTER] = 1; + + smp_call_function(sync_master, NULL, 1, 0); + + while (go[MASTER]) /* wait for master to be ready */ + no_cpu_relax(); + + spin_lock_irqsave(&tsc_sync_lock, flags); + { + for (i = 0; i < NUM_ROUNDS; ++i) { + delta = get_delta(&rt, &master_time_stamp); + if (delta == 0) { + done = 1; /* let's lock on to this... 
*/ + bound = rt; + } + + if (!done) { + unsigned long t; + if (i > 0) { + adjust_latency += -delta; + adj = -delta + adjust_latency/4; + } else + adj = -delta; + + rdtscll(t); + wrmsrl(MSR_IA32_TSC, t + adj); + } +#if DEBUG_TSC_SYNC + t[i].rt = rt; + t[i].master = master_time_stamp; + t[i].diff = delta; + t[i].lat = adjust_latency/4; +#endif + } + } + spin_unlock_irqrestore(&tsc_sync_lock, flags); + +#if DEBUG_TSC_SYNC + for (i = 0; i < NUM_ROUNDS; ++i) + printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n", + t[i].rt, t[i].master, t[i].diff, t[i].lat); +#endif + + printk(KERN_INFO + "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " + "maxerr %lu cycles)\n", + smp_processor_id(), boot_cpu_id, delta, rt); } -/* - * Synchronize TSC from BP to AP. - */ -static void __cpuinit __sync_tsc_bp(int cpu) +static void __cpuinit tsc_sync_wait(void) { - if (!cpu_has_tsc) + if (notscsync || !cpu_has_tsc) return; - - /* Wait for AP */ - while (atomic_read(&tsc_flag) == 0) - cpu_relax(); - /* Save BPs TSC */ - sync_core(); - rdtscll(bp_tsc); - /* Don't do the sync core here to avoid too much latency. */ - mb(); - /* Start the AP */ - _raw_spin_unlock(&tsc_sync_lock); - /* Wait for AP again */ - while (atomic_read(&tsc_flag) < 2) - cpu_relax(); - rdtscl(bp_tsc); - barrier(); + printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), + boot_cpu_id); + sync_tsc(); } -static void __cpuinit sync_tsc_bp(int cpu) +static __init int notscsync_setup(char *s) { - int i; - for (i = 0; i < NR_LOOPS - 1; i++) { - __sync_tsc_bp(cpu); - sync_tsc_bp_init(1); - } - __sync_tsc_bp(cpu); - printk(KERN_INFO "Synced TSC of CPU %d difference %Ld\n", - cpu, ap_tsc - bp_tsc); + notscsync = 1; + return 0; } +__setup("notscsync", notscsync_setup); static atomic_t init_deasserted __cpuinitdata; @@ -315,11 +429,6 @@ void __cpuinit start_secondary(void) cpu_init(); smp_callin(); - /* - * Synchronize the TSC with the BP - */ - sync_tsc_ap(); - /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); @@ -334,7 +443,6 @@ void __cpuinit start_secondary(void) enable_8259A_irq(0); } - enable_APIC_timer(); /* @@ -343,6 +451,11 @@ void __cpuinit start_secondary(void) cpu_set(smp_processor_id(), cpu_online_map); mb(); + /* Wait for TSC sync to not schedule things before. + We still process interrupts, which could see an inconsistent + time in that window unfortunately. */ + tsc_sync_wait(); + cpu_idle(); } @@ -600,8 +713,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); - print_cpu_info(&cpu_data[cpu]); Dprintk("CPU has booted.\n"); } else { boot_error = 1; @@ -889,18 +1000,14 @@ int __cpuinit __cpu_up(unsigned int cpu) printk("__cpu_up: bad cpu %d\n", cpu); return -EINVAL; } - sync_tsc_bp_init(1); /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { - sync_tsc_bp_init(0); Dprintk("do_boot_cpu failed %d\n", err); return err; } - sync_tsc_bp(cpu); - /* Unleash the CPU! 
*/ Dprintk("waiting for cpu %d\n", cpu); -- cgit v1.2.3 From 2942283e970b357c146ebdcbbcc0bdf5048615ff Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:26 -0700 Subject: [PATCH] x86_64: Remove x86_apicid field Remove x86_apicid field Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index aab249a5746..966dfc768cb 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -727,7 +727,7 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) static void __init amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - int cpu = c->x86_apicid; + int cpu = smp_processor_id(); int node = 0; if (c->x86_num_cores == 1) return; @@ -929,7 +929,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_num_cores = 1; - c->x86_apicid = c == &boot_cpu_data ? 0 : c - cpu_data; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -958,7 +957,6 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) } if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - c->x86_apicid = misc >> 24; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; -- cgit v1.2.3 From 622dcaf974668f7dee252fc6a8f9a2710c2070f4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:26 -0700 Subject: [PATCH] x86_64: Don't print the internal k8c+ flag in /proc/cpuinfo It is not very useful to the user and more an kernel internal implementation detail. So hide it. 
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 966dfc768cb..b9fd0252c27 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -1095,7 +1095,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Other (Linux-defined) */ - "cxmmx", NULL, "cyrix_arr", "centaur_mcr", "k8c+", + "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, "constant_tsc", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -- cgit v1.2.3 From 0af2be0b721997512191e981a051fcb070b87260 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:27 -0700 Subject: [PATCH] x86_64: Remove unique APIC/IO-APIC ID check It is unnecessary on modern Intel or AMD systems, and that is all we support on x86-64 Also causes problems on various systems Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 70 -------------------------------------------- arch/x86_64/kernel/mpparse.c | 2 +- 2 files changed, 1 insertion(+), 71 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 60be58617eb..ac768432495 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1804,76 +1804,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_unique_id (int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = phys_cpu_present_map; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= IO_APIC_MAX_ID) { - apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. 
- */ - if (physid_isset(apic_id, apic_id_map)) { - - for (i = 0; i < IO_APIC_MAX_ID; i++) { - if (!physid_isset(i, apic_id_map)) - break; - } - - if (i == IO_APIC_MAX_ID) - panic("Max apic_id exceeded!\n"); - - apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - physid_set(apic_id, apic_id_map); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - - int __init io_apic_get_version (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 7ec031c6ca1..f221231cd0a 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -759,7 +759,7 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicid = id; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* -- cgit v1.2.3 From 312df5f1a1da780e084b328bcabb02a6dcd044c3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:28 -0700 Subject: [PATCH] x86_64: Add pmtimer support There are unfortunately more and more multi processor Opteron systems which don't have HPET timer support in the southbridge. This covers in particular Nvidia and VIA chipsets. They also don't guarantee that the TSCs are synchronized between CPUs; and especially with MP powernow the systems are nearly unusable because the time gets very inconsistent between CPUs. The timer code for x86-64 was originally written under the assumption that we could fall back to the HPET timer on such systems. But this doesn't work there. Another alternative is to use the ACPI PM timer as primary time source. This patch does that. The kernel only uses PM timer when there is no other choice because it has some disadvantages. Ported over from i386. It should be faster than the i386 version because I dropped the "read three times" workaround, but is still considerable slower than HPET and also does not work together with vsyscalls which have to be disabled. 
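The 286/1024 shortcut used by the new pmtimer.c below follows from the 3.579545 MHz PM timer frequency; a quick standalone check of that approximation and of the worst-case HZ=100 delta quoted in the patch comment:

/* Standalone check of the PM-timer tick -> microsecond approximation
 * used below: 1 / 3.579545 MHz ~= 286/1024 us per tick. */
#include <stdio.h>

static unsigned int cyc2us_approx(unsigned int cycles)
{
	return (cycles * 286) >> 10;	/* the integer shortcut from the patch */
}

int main(void)
{
	double exact  = 1.0 / 3.579545;		/* us per PM-timer tick */
	double approx = 286.0 / 1024.0;
	double err    = (exact - approx) / exact * 100.0;

	printf("exact=%.6f us, approx=%.6f us, error=%.3f%%\n",
	       exact, approx, err);		/* ~0.024%, as the comment says */
	printf("35796 ticks -> %u us (HZ=100 worst case)\n",
	       cyc2us_approx(35796));		/* ~10000 us, no u32 overflow */
	return 0;
}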
Cc: Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/pmtimer.c | 101 ++++++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/time.c | 62 +++++++++++++++++++------- arch/x86_64/kernel/vsyscall.c | 3 +- 4 files changed, 148 insertions(+), 19 deletions(-) create mode 100644 arch/x86_64/kernel/pmtimer.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 0a3318e08ab..5ca4a4598fd 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_MODULES) += module.o diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c new file mode 100644 index 00000000000..feb5f108dd2 --- /dev/null +++ b/arch/x86_64/kernel/pmtimer.c @@ -0,0 +1,101 @@ +/* Ported over from i386 by AK, original copyright was: + * + * (C) Dominik Brodowski 2003 + * + * Driver to use the Power Management Timer (PMTMR) available in some + * southbridges as primary timing source for the Linux kernel. + * + * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, + * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. + * + * This file is licensed under the GPL v2. + * + * Dropped all the hardware bug workarounds for now. Hopefully they + * are not needed on 64bit chipsets. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* The I/O port the PMTMR resides at. + * The location is detected during setup_arch(), + * in arch/i386/kernel/acpi/boot.c */ +u32 pmtmr_ioport; + +/* value of the Power timer at last timer interrupt */ +static u32 offset_delay; +static u32 last_pmtmr_tick; + +#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ + +static inline u32 cyc2us(u32 cycles) +{ + /* The Power Management Timer ticks at 3.579545 ticks per microsecond. + * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] + * + * Even with HZ = 100, delta is at maximum 35796 ticks, so it can + * easily be multiplied with 286 (=0x11E) without having to fear + * u32 overflows. 
+ */ + cycles *= 286; + return (cycles >> 10); +} + +int pmtimer_mark_offset(void) +{ + static int first_run = 1; + unsigned long tsc; + u32 lost; + + u32 tick = inl(pmtmr_ioport); + u32 delta; + + delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); + + last_pmtmr_tick = tick; + monotonic_base += delta * NSEC_PER_USEC; + + delta += offset_delay; + + lost = delta / (USEC_PER_SEC / HZ); + offset_delay = delta % (USEC_PER_SEC / HZ); + + rdtscll(tsc); + vxtime.last_tsc = tsc - offset_delay * cpu_khz; + + /* don't calculate delay for first run, + or if we've got less then a tick */ + if (first_run || (lost < 1)) { + first_run = 0; + offset_delay = 0; + } + + return lost - 1; +} + +unsigned int do_gettimeoffset_pm(void) +{ + u32 now, offset, delta = 0; + + offset = last_pmtmr_tick; + now = inl(pmtmr_ioport); + delta = (now - offset) & ACPI_PM_MASK; + + return offset_delay + cyc2us(delta); +} + + +static int __init nopmtimer_setup(char *s) +{ + pmtmr_ioport = 0; + return 0; +} + +__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 0e3b60e1ac5..735b6767c8e 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include /* for PM timer frequency */ #include #include #include @@ -396,6 +398,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; vxtime.last = offset; +#ifdef CONFIG_X86_PM_TIMER + } else if (vxtime.mode == VXTIME_PMTMR) { + lost = pmtimer_mark_offset(); +#endif } else { offset = (((tsc - vxtime.last_tsc) * vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); @@ -898,6 +904,13 @@ void __init time_init(void) hpet_period; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; +#ifdef CONFIG_X86_PM_TIMER + } else if (pmtmr_ioport) { + vxtime_hz = PM_TIMER_FREQUENCY; + timename = "PM"; + pit_init(); + cpu_khz = pit_calibrate_tsc(); +#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); @@ -922,6 +935,27 @@ void __init time_init(void) #endif } +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ +#ifdef CONFIG_SMP + if (oem_force_hpet_timer()) + return 1; + /* Intel systems are normally all synchronized. Exceptions + are handled in the OEM check above. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + /* All in a single socket - should be synchronized */ + if (cpus_weight(cpu_core_map[0]) == num_online_cpus()) + return 0; +#endif + /* Assume multi socket systems are not synchronized */ + return num_online_cpus() > 1; +} + /* * Decide after all CPUs are booted what mode gettimeofday should use. */ @@ -929,29 +963,23 @@ void __init time_init_gtod(void) { char *timetype; - /* - * AMD systems with more than one CPU don't have fully synchronized - * TSCs. Always use HPET gettimeofday for these, although it is slower. - * Intel SMP systems usually have synchronized TSCs, so use always - * the TSC. - * - * Exceptions: - * IBM Summit2 checked by oem_force_hpet_timer(). - * AMD dual core may also not need HPET. Check me. - * - * Can be turned off with "notsc". - */ - if (num_online_cpus() > 1 && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - notsc = 1; - /* Some systems will want to disable TSC and use HPET. 
*/ - if (oem_force_hpet_timer()) + if (unsynchronized_tsc()) notsc = 1; if (vxtime.hpet_address && notsc) { timetype = "HPET"; vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; +#ifdef CONFIG_X86_PM_TIMER + /* Using PM for gettimeofday is quite slow, but we have no other + choice because the TSC is too unreliable on some systems. */ + } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { + timetype = "PM"; + do_gettimeoffset = do_gettimeoffset_pm; + vxtime.mode = VXTIME_PMTMR; + sysctl_vsyscall = 0; + printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); +#endif } else { timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b4b8dc59663..1a7541435ef 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -65,7 +65,7 @@ static force_inline void do_vgettimeofday(struct timeval * tv) usec = (__xtime.tv_nsec / 1000) + (__jiffies - __wall_jiffies) * (1000000 / HZ); - if (__vxtime.mode == VXTIME_TSC) { + if (__vxtime.mode != VXTIME_HPET) { sync_core(); rdtscll(t); if (t < __vxtime.last_tsc) @@ -217,7 +217,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); - sysctl_vsyscall = 1; register_sysctl_table(kernel_root_table2, 0); return 0; } -- cgit v1.2.3 From d1099e8a18960693c04507bdd7b9403db70bfd97 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:29 -0700 Subject: [PATCH] x86_64: check if ptrace RIP is canonical This works around an AMD Erratum. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/ptrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index e26e86bb56f..892212161aa 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -277,6 +277,11 @@ static int putreg(struct task_struct *child, return -EIO; value &= 0xffff; break; + case offsetof(struct user_regs_struct, rip): + /* Check if the new RIP address is canonical */ + if (value >= TASK_SIZE) + return -EIO; + break; } put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; -- cgit v1.2.3 From f6b8d4778c04148729cc0b0dcd335a4411c44276 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:30 -0700 Subject: [PATCH] x86_64: Fix canonical checking for segment registers in ptrace Allowed user programs to set a non canonical segment base, which would cause oopses in the kernel later. Credit-to: Alexander Nyberg For identifying and reporting this bug. 
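For background on both canonical-address fixes here: an x86-64 address is canonical when bits 63..47 are a sign extension of bit 47, and the stricter `value >= TASK_SIZE` test additionally rejects canonical kernel-half addresses. A rough standalone illustration of the two predicates (48-bit virtual addressing assumed; the TASK_SIZE value is only illustrative):

/* Rough illustration of the checks discussed in these two patches,
 * assuming the 48-bit virtual address layout of these kernels.
 * Not kernel code; TASK_SIZE here is only an illustrative constant. */
#include <stdio.h>

/* canonical: bits 63..47 must all equal bit 47 (sign extension) */
static int is_canonical(unsigned long long v)
{
	unsigned long long top = v >> 47;
	return top == 0 || top == 0x1ffff;
}

/* the stricter ptrace test: user-half canonical addresses only */
static int is_user_canonical(unsigned long long v, unsigned long long task_size)
{
	return v < task_size;
}

int main(void)
{
	unsigned long long task_size = 0x00007ffffffff000ULL;	/* illustrative */
	unsigned long long samples[] = {
		0x00007f0000000000ULL,	/* user half: passes both checks */
		0xffff800000000000ULL,	/* kernel half: canonical, but rejected */
		0x0000800000000000ULL,	/* non-canonical: rejected by both */
	};
	for (int i = 0; i < 3; i++)
		printf("%#018llx canonical=%d user_ok=%d\n", samples[i],
		       is_canonical(samples[i]),
		       is_user_canonical(samples[i], task_size));
	return 0;
}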
Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/ptrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 892212161aa..60dc9b98951 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -257,13 +257,13 @@ static int putreg(struct task_struct *child, value &= 0xffff; return 0; case offsetof(struct user_regs_struct,fs_base): - if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) - return -EIO; + if (value >= TASK_SIZE) + return -EIO; child->thread.fs = value; return 0; case offsetof(struct user_regs_struct,gs_base): - if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) - return -EIO; + if (value >= TASK_SIZE) + return -EIO; child->thread.gs = value; return 0; case offsetof(struct user_regs_struct, eflags): -- cgit v1.2.3 From 376ec33fcb246fdd7de1696d33739a0fd36f6537 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 16 May 2005 21:53:32 -0700 Subject: [PATCH] x86_64: Fix OEM hpet check Use bitmap_zero instead of bitmap_empty to initialise cpu mask This makes it actually run reliable instead of relying on stack state. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 7e13545748e..7f0b44e4a6e 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -925,7 +925,7 @@ __init int oem_force_hpet_timer(void) unsigned id; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - bitmap_empty(clustermap, NUM_APIC_CLUSTERS); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; -- cgit v1.2.3 From f3c5f5e7eeaf7c68ecc1d37200cd1ade0b3da7b9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:33 -0700 Subject: [PATCH] x86_64: Make vsyscall.c compile without CONFIG_SYSCTL Originally from Matt Tolentino Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/vsyscall.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 1a7541435ef..2e573442594 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -217,7 +217,9 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); +#ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); +#endif return 0; } -- cgit v1.2.3 From 751521149a05e308d863d01ced61080ce1a2ec99 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:34 -0700 Subject: [PATCH] x86_64: Collected NMI watchdog fixes. Collected NMI watchdog fixes. - Fix call of check_nmi_watchdog - Remove earlier move of check_nmi_watchdog to later. It does not fix the race it was supposed to fix fully. - Remove unused P6 definitions - Add support for performance counter based watchdog on P4 systems. This allows to run it only once per second, which saves some CPU time. Previously it would run at 1000Hz, which was too much. Code ported from i386 Make this the default on Intel systems. 
- Use check_nmi_watchdog with local APIC based nmi - Fix race in touch_nmi_watchdog - Fix bug that caused incorrect performance counters to be programmed in a few cases on K8. - Remove useless check for local APIC - Use local_t and per_cpu variables for per CPU data. - Keep other CPUs busy during check_nmi_watchdog to make sure they really tick when in lapic mode. - Only check CPUs that are actually online. - Various other fixes. - Fix fallback path when MSRs are unimplemented Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 3 +- arch/x86_64/kernel/nmi.c | 250 +++++++++++++++++++++++++++++-------------- arch/x86_64/kernel/smpboot.c | 3 + 3 files changed, 177 insertions(+), 79 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 7f0b44e4a6e..f8e6cc4fecd 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -33,6 +33,7 @@ #include #include #include +#include int apic_verbosity; @@ -1056,7 +1057,7 @@ int __init APIC_init_uniprocessor (void) nr_ioapics = 0; #endif setup_boot_APIC_clock(); - + check_nmi_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index ec13eb97e8e..31c0f2e6ac9 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -59,7 +60,8 @@ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +static unsigned int nmi_p4_cccr_val; /* Note that these events don't tick when the CPU idles. This means the frequency varies with CPU load. */ @@ -71,67 +73,87 @@ unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED +#define MSR_P4_MISC_ENABLE 0x1A0 +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) +#define MSR_P4_PERFCTR0 0x300 +#define MSR_P4_CCCR0 0x360 +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. 
[IA32-Vol3, Section 14.9.9] */ +#define MSR_P4_IQ_COUNTER0 0x30C +#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) +#define P4_NMI_IQ_CCCR0 \ + (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ + P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) + +static __init inline int nmi_known_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return boot_cpu_data.x86 == 15; + case X86_VENDOR_INTEL: + return boot_cpu_data.x86 == 15; + } + return 0; +} /* Run after command line and cpu_init init, but before all other checks */ void __init nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; - - /* For some reason the IO APIC watchdog doesn't work on the AMD - 8111 chipset. For now switch to local APIC mode using - perfctr0 there. On Intel CPUs we don't have code to handle - the perfctr and the IO-APIC seems to work, so use that. */ - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - nmi_watchdog = NMI_LOCAL_APIC; - printk(KERN_INFO - "Using local APIC NMI watchdog using perfctr0\n"); - } else { - printk(KERN_INFO "Using IO APIC NMI watchdog\n"); + if (nmi_known_cpu()) + nmi_watchdog = NMI_LOCAL_APIC; + else nmi_watchdog = NMI_IO_APIC; - } } -/* Why is there no CPUID flag for this? */ -static __init int cpu_has_lapic(void) +#ifdef CONFIG_SMP +/* The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - case X86_VENDOR_AMD: - return boot_cpu_data.x86 >= 6; - /* .... add more cpus here or find a different way to figure this out. */ - default: - return 0; - } + volatile int *endflag = data; + local_irq_enable(); + /* Intentionally don't use cpu_relax here. This is + to make sure that the performance counter really ticks, + even if there is a simulator or similar that catches the + pause instruction. On a real HT machine this is fine because + all other CPUs are busy with "useless" delay loops and don't + care if they get somewhat less cycles. */ + while (*endflag == 0) + barrier(); } +#endif -static int __init check_nmi_watchdog (void) +int __init check_nmi_watchdog (void) { + volatile int endflag = 0; int *counts; int cpu; - if (nmi_watchdog == NMI_NONE) - return 0; + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!counts) + return -1; - if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { - nmi_watchdog = NMI_NONE; - return -1; - } + printk(KERN_INFO "testing NMI watchdog ... "); - counts = kmalloc(NR_CPUS * sizeof(int),GFP_KERNEL); - if (!counts) { - nmi_watchdog = NMI_NONE; - return 0; - } - - printk(KERN_INFO "Testing NMI watchdog ... 
"); + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); for (cpu = 0; cpu < NR_CPUS; cpu++) counts[cpu] = cpu_pda[cpu].__nmi_count; @@ -139,16 +161,22 @@ static int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_online(cpu)) + continue; if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { - printk("CPU#%d: NMI appears to be stuck (%d)!\n", + endflag = 1; + printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, + counts[cpu], cpu_pda[cpu].__nmi_count); nmi_active = 0; lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; + nmi_perfctr_msr = 0; kfree(counts); return -1; } } + endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to @@ -159,8 +187,6 @@ static int __init check_nmi_watchdog (void) kfree(counts); return 0; } -/* Have this called later during boot so counters are updating */ -late_initcall(check_nmi_watchdog); int __init setup_nmi_watchdog(char *str) { @@ -178,7 +204,7 @@ int __init setup_nmi_watchdog(char *str) if (nmi >= NMI_INVALID) return 0; - nmi_watchdog = nmi; + nmi_watchdog = nmi; return 1; } @@ -193,7 +219,10 @@ static void disable_lapic_nmi_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, 0, 0); break; case X86_VENDOR_INTEL: - wrmsr(MSR_IA32_EVNTSEL0, 0, 0); + if (boot_cpu_data.x86 == 15) { + wrmsr(MSR_P4_IQ_CCCR0, 0, 0); + wrmsr(MSR_P4_CRU_ESCR0, 0, 0); + } break; } nmi_active = -1; @@ -261,7 +290,7 @@ void enable_timer_nmi_watchdog(void) static int nmi_pm_active; /* nmi_active before suspend */ -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +static int lapic_nmi_suspend(struct sys_device *dev, u32 state) { nmi_pm_active = nmi_active; disable_lapic_nmi_watchdog(); @@ -308,22 +337,27 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ +static void clear_msr_range(unsigned int base, unsigned int n) +{ + unsigned int i; + + for(i = 0; i < n; ++i) + wrmsr(base+i, 0, 0); +} + static void setup_k7_watchdog(void) { int i; unsigned int evntsel; - /* No check, so can start with slow frequency */ - nmi_hz = 1; - - /* XXX should check these in EFER */ - nmi_perfctr_msr = MSR_K7_PERFCTR0; for(i = 0; i < 4; ++i) { /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) + if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { + nmi_perfctr_msr = 0; return; + } wrmsrl(MSR_K7_PERFCTR0+i, 0UL); } @@ -333,12 +367,54 @@ static void setup_k7_watchdog(void) | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } + +static int setup_p4_watchdog(void) +{ + unsigned int misc_enable, dummy; + + rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) + return 0; + + nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; + nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) + nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; +#endif + + if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) + clear_msr_range(0x3F1, 2); + /* MSR 0x3F0 seems to have a default value of 0xFC00, but current + docs doesn't fully define it, so leave it alone for now. 
*/ + if (boot_cpu_data.x86_model >= 0x3) { + /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ + clear_msr_range(0x3A0, 26); + clear_msr_range(0x3BC, 3); + } else { + clear_msr_range(0x3A0, 31); + } + clear_msr_range(0x3C0, 6); + clear_msr_range(0x3C8, 6); + clear_msr_range(0x3E0, 2); + clear_msr_range(MSR_P4_CCCR0, 18); + clear_msr_range(MSR_P4_PERFCTR0, 18); + + wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); + wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); + Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); + wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + apic_write(APIC_LVTPC, APIC_DM_NMI); + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + return 1; +} + void setup_apic_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { @@ -349,6 +425,13 @@ void setup_apic_nmi_watchdog(void) return; setup_k7_watchdog(); break; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 != 15) + return; + if (!setup_p4_watchdog()) + return; + break; + default: return; } @@ -363,56 +446,67 @@ void setup_apic_nmi_watchdog(void) * * as these watchdog NMI IRQs are generated on every CPU, we only * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up - * here too!] */ -static unsigned int - last_irq_sums [NR_CPUS], - alert_counter [NR_CPUS]; +static DEFINE_PER_CPU(unsigned, last_irq_sum); +static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog (void) { int i; /* - * Just reset the alert counters, (other CPUs might be - * spinning on locks we hold): + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. */ for (i = 0; i < NR_CPUS; i++) - alert_counter[i] = 0; + per_cpu(nmi_touch, i) = 1; } void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { - int sum, cpu; + int sum; + int touched = 0; - cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); - if (last_irq_sums[cpu] == sum) { + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) { + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { - alert_counter[cpu] = 0; + local_set(&__get_cpu_var(alert_counter), 0); return; } die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); } } else { - last_irq_sums[cpu] = sum; - alert_counter[cpu] = 0; + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); } - if (nmi_perfctr_msr) + if (nmi_perfctr_msr) { + if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. 
+ */ + wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + } } static int dummy_nmi_callback(struct pt_regs * regs, int cpu) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 17efc40baa3..d00e494c1a3 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -56,6 +56,7 @@ #include #include #include +#include /* Change for real CPU hotplug. Note other files need to be fixed first too. */ @@ -1030,4 +1031,6 @@ void __cpuinit smp_cpus_done(unsigned int max_cpus) detect_siblings(); time_init_gtod(); + + check_nmi_watchdog(); } -- cgit v1.2.3 From 18a2b6471277c9ffbaa14a0c87a11a075eb354bd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 16 May 2005 21:53:35 -0700 Subject: [PATCH] x86_64: Don't assume BSP has ID 0 in new smp bootup This patch removes the assumption that LAPIC entries contain the BSP as its first entry. This is a slight improvement to the temporary fix submitted by Suresh Siddha. - Removes assumption that LAPIC entries contain BSP first. - Builds x86_acpiid_to_apicid[] and bios_cpu_apicid[] properly with BSP as first entry. - Made maxcpus=1 boot on these systems. Since the parsing earlier in arch/x86_64/kernel/mpparse.c stopped after maxcpus entries, other entries were not processed, this causes kernel not to boot on these systems. TBD: x86_acpiid_to_apicid and bios_cpu_apicid[] seem to be exactly the same. This could be removed, but might need more work to cleanup. Signed-off-by: Ashok Raj Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 20 ++++++++++++++------ arch/x86_64/kernel/smpboot.c | 2 -- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index f221231cd0a..f86d9db94bf 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -107,6 +107,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __init MP_processor_info (struct mpc_config_processor *m) { int ver; + static int found_bsp=0; if (!(m->mpc_cpuflag & CPU_ENABLED)) return; @@ -126,11 +127,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m) " Processor ignored.\n", NR_CPUS); return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } num_processors++; @@ -150,7 +146,19 @@ static void __init MP_processor_info (struct mpc_config_processor *m) ver = 0x10; } apic_version[m->mpc_apicid] = ver; - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + /* + * bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. 
+ */ + bios_cpu_apicid[0] = m->mpc_apicid; + x86_cpu_to_apicid[0] = m->mpc_apicid; + found_bsp = 1; + } else { + bios_cpu_apicid[num_processors - found_bsp] = m->mpc_apicid; + x86_cpu_to_apicid[num_processors - found_bsp] = m->mpc_apicid; + } } static void __init MP_bus_info (struct mpc_config_bus *m) diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index d00e494c1a3..5abdee1e16a 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -645,7 +645,6 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) printk("failed fork for CPU %d\n", cpu); return PTR_ERR(idle); } - x86_cpu_to_apicid[cpu] = apicid; cpu_pda[cpu].pcurrent = idle; @@ -954,7 +953,6 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus) GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); /* Or can we switch back to PIC here? */ } - x86_cpu_to_apicid[0] = boot_cpu_id; /* * Now start the IO-APICs -- cgit v1.2.3 From b41e29398a873945d02e0009ce7e57608fdb4042 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 May 2005 14:27:55 -0700 Subject: [PATCH] x86_64: 386/x86-64 Further AMD dual core fixes - Remove duplicated ifdef - Make core_id match what Intel uses - Initialize phys_proc_id correctly for non DC case - Handle non power of two core numbers. Fixes for both i386 and x86-64 Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b9fd0252c27..99f038ede23 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -719,7 +719,6 @@ static void __init display_cacheinfo(struct cpuinfo_x86 *c) } } -#ifdef CONFIG_SMP /* * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. @@ -729,16 +728,24 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP int cpu = smp_processor_id(); int node = 0; + unsigned bits; if (c->x86_num_cores == 1) return; - /* Fix up the APIC ID following the AMD specification. */ - cpu_core_id[cpu] >>= hweight32(c->x86_num_cores - 1); + + bits = 0; + while ((1 << bits) < c->x86_num_cores) + bits++; + + /* Low order bits define the core id (index of core in socket) */ + cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + phys_proc_id[cpu] >>= bits; #ifdef CONFIG_NUMA /* When an ACPI SRAT table is available use the mappings from SRAT instead. 
*/ if (acpi_numa <= 0) { - node = cpu_core_id[cpu]; + node = phys_proc_id[cpu]; if (!node_online(node)) node = first_node(node_online_map); cpu_to_node[cpu] = node; @@ -746,18 +753,11 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) node = cpu_to_node[cpu]; } #endif - /* For now: - better than BAD_APIC_ID at least*/ - phys_proc_id[cpu] = cpu_core_id[cpu]; printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", cpu, c->x86_num_cores, node, cpu_core_id[cpu]); #endif } -#else -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) -{ -} -#endif static int __init init_amd(struct cpuinfo_x86 *c) { @@ -963,8 +963,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_SMP - phys_proc_id[smp_processor_id()] = - cpu_core_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; + phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; #endif } -- cgit v1.2.3 From c4d1fcf3a2ea89b6d6221fa8b4588c77aff50995 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 May 2005 14:27:56 -0700 Subject: [PATCH] x86_64: Don't allow accesses below register frame in ptrace There was a "off by one quad word" error in there. I don't think it is exploitable because it will only store into a unused area, but better to plug it. Found and fixed by John Blackwood Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 60dc9b98951..525f6a128a2 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -380,7 +380,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; switch (addr) { - case 0 ... sizeof(struct user_regs_struct): + case 0 ... sizeof(struct user_regs_struct) - sizeof(long): tmp = getreg(child, addr); break; case offsetof(struct user, u_debugreg[0]): @@ -425,7 +425,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; switch (addr) { - case 0 ... sizeof(struct user_regs_struct): + case 0 ... sizeof(struct user_regs_struct) - sizeof(long): ret = putreg(child, addr, data); break; /* Disallows to set a breakpoint into the vsyscall */ -- cgit v1.2.3 From 607a16858397829806c5a4db999ce6daf327f98c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 May 2005 14:27:58 -0700 Subject: [PATCH] x86_64: Fix 32bit system call restart The test case at http://cvs.sourceforge.net/viewcvs.py/posixtest/posixtestsuite/conforman ce/interfaces/clock_nanosleep/1-5.c fails if it runs as a 32bit process on x86_86 machines. The root cause is the sub 32bit process fails to restart the syscall after it is interrupted by a signal. The syscall number of sys_restart_syscall in table sys_call_table is __NR_restart_syscall (219) while it's __NR_ia32_restart_syscall (0) in ia32_sys_call_table. When regs->rax==(unsigned long)-ERESTART_RESTARTBLOCK, function do_signal doesn't distinguish if the process is 64bit or 32bit, and always sets restart syscall number as __NR_restart_syscall (219). 
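In other words, the same logical operation -- restart_syscall() -- lives at different indices in the two syscall tables, so the value written back into rax has to depend on whether the task is ia32. A minimal sketch of that selection (constants taken from the numbers quoted above):

/* Sketch of the decision the fix below introduces: the syscall number
 * used to restart an interrupted syscall differs between the 64-bit
 * and the ia32 (compat) syscall tables. */
#include <stdio.h>

#define NR_RESTART_SYSCALL_64	219	/* __NR_restart_syscall in sys_call_table */
#define NR_RESTART_SYSCALL_IA32	  0	/* __NR_ia32_restart_syscall */

static long restart_nr(int task_is_ia32)
{
	return task_is_ia32 ? NR_RESTART_SYSCALL_IA32 : NR_RESTART_SYSCALL_64;
}

int main(void)
{
	printf("64-bit task restarts with nr %ld, ia32 task with nr %ld\n",
	       restart_nr(0), restart_nr(1));
	return 0;
}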
Signed-off-by: Zhang Yanmin Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index d439ced150c..3fdcdba0fec 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -452,7 +452,9 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) regs->rip -= 2; } if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { - regs->rax = __NR_restart_syscall; + regs->rax = test_thread_flag(TIF_IA32) ? + __NR_ia32_restart_syscall : + __NR_restart_syscall; regs->rip -= 2; } } -- cgit v1.2.3 From 14d98cad82b78956957e95567b8b5fb38ec5859f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 May 2005 14:27:59 -0700 Subject: [PATCH] x86_64: Add option to disable timer check This works around the too fast timer seen on some ATI boards. I don't feel confident enough about it yet to enable it by default, but give users the option. Patch and debugging from Christopher Allen Wing , with minor tweaks (renamed the option and documented it) Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index ac768432495..80e9b498c44 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -42,6 +42,8 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ +static int no_timer_check; + static DEFINE_SPINLOCK(ioapic_lock); /* @@ -1601,7 +1603,7 @@ static inline void check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ unmask_IO_APIC_irq(0); - if (timer_irq_works()) { + if (!no_timer_check && timer_irq_works()) { nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -1671,6 +1673,13 @@ static inline void check_timer(void) panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); } +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * * IRQ's that are handled by the PIC in the MPS IOAPIC case. -- cgit v1.2.3 From 2df9fa36643f03b0462b170515c1e221c3709fa9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 20 May 2005 14:27:59 -0700 Subject: [PATCH] x86_64: i386/x86-64: Export cpu_core_map Needed for the powernow k8 driver for dual core support. Signed-off-by: Andi Kleen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 5abdee1e16a..f1ec0f34594 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -94,6 +94,7 @@ int smp_threads_ready; cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_core_map); /* * Trampoline 80x86 program as an array. 
-- cgit v1.2.3 From 4f60fdf613536402e3313c5019b3ae6060b2ae4a Mon Sep 17 00:00:00 2001 From: Alexander Nyberg Date: Wed, 25 May 2005 12:31:28 -0700 Subject: [PATCH] x86_64: CONFIG_BUG=n fixes Fixes some !CONFIG_BUG warnings: include/asm/mmu_context.h: I funktion `switch_mm': include/asm/mmu_context.h:57: varning: implicit declaration of function `out_of_line_bug' Signed-off-by: Alexander Nyberg Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/traps.c | 2 ++ arch/x86_64/kernel/x8664_ksyms.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 65a37f52c56..3dfec8fdabc 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -332,10 +332,12 @@ void handle_BUG(struct pt_regs *regs) printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); } +#ifdef CONFIG_BUG void out_of_line_bug(void) { BUG(); } +#endif static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index a43dedb58fa..68ec03070e5 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -193,8 +193,9 @@ EXPORT_SYMBOL(smp_num_siblings); extern void do_softirq_thunk(void); EXPORT_SYMBOL(do_softirq_thunk); -void out_of_line_bug(void); +#ifdef CONFIG_BUG EXPORT_SYMBOL(out_of_line_bug); +#endif EXPORT_SYMBOL(init_level4_pgt); -- cgit v1.2.3 From c5924b7d97430657fa07ebf1a208c7e8cf135ef3 Mon Sep 17 00:00:00 2001 From: Oliver Korpilla Date: Fri, 27 May 2005 12:52:55 -0700 Subject: [PATCH] x86_64: signal.c build fix For unspecified reasons, arch/x86_64/kernel/signal.c apparently needs ia32_unistd.h. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 3fdcdba0fec..429c0269dc4 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -28,6 +28,7 @@ #include #include #include +#include /* #define DEBUG_SIG 1 */ -- cgit v1.2.3 From 8d91640606c47551771a9b27664f249f4b176613 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 31 May 2005 14:39:26 -0700 Subject: [PATCH] x86_64: More fixes for compilation without CONFIG_ACPI Suggested by Alexander Nyberg Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/io_apic.c | 1 + arch/x86_64/kernel/mpparse.c | 1 + arch/x86_64/kernel/time.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 80e9b498c44..afd87e64d0a 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -37,6 +37,7 @@ #include #include #include +#include #define __apicdebuginit __init diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index f86d9db94bf..61a63be6b29 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -30,6 +30,7 @@ #include #include #include +#include /* Have we found an MP table */ int smp_found_config; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 735b6767c8e..fb8c809b4cd 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -27,7 +27,9 @@ #include #include #include +#ifdef CONFIG_ACPI #include /* for PM timer frequency */ +#endif 
#include #include #include -- cgit v1.2.3 From 42442ed5744b03f5351a142649b8b4c97e6950ab Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 8 Jun 2005 15:49:25 -0700 Subject: [PATCH] revert x86_64-use-the-e820-hole-to-map-the-iommu-agp-aperture Martin Bligh determined that this patch is causing his test box to not boot. Revert. Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/aperture.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index a491f72cc96..504e6347499 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -33,12 +33,10 @@ int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; -#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. */ -static struct resource aper_res = { - .name = "Aperture", - .flags = IORESOURCE_MEM, -}; +#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) static u32 __init allocate_aperture(void) { @@ -55,24 +53,11 @@ static u32 __init allocate_aperture(void) aper_size = (32 * 1024 * 1024) << fallback_aper_order; /* - * Aperture has to be naturally aligned. This means an 2GB - * aperture won't have much chances to find a place in the - * lower 4GB of memory. Unfortunately we cannot move it up - * because that would make the IOMMU useless. + * Aperture has to be naturally aligned. This means an 2GB aperture won't + * have much chances to find a place in the lower 4GB of memory. + * Unfortunately we cannot move it up because that would make the + * IOMMU useless. */ - - /* First try to find some free unused space */ - if (!allocate_resource(&iomem_resource, &aper_res, - aper_size, - 0, 0xffffffff, - aper_size, - NULL, NULL)) { - printk(KERN_INFO "Putting aperture at %lx-%lx\n", - aper_res.start, aper_res.end); - return aper_res.start; - } - - /* No free space found. Go on to waste some memory... */ p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { printk("Cannot allocate aperture memory hole (%p,%uK)\n", @@ -81,7 +66,7 @@ static u32 __init allocate_aperture(void) free_bootmem_node(nd0, (unsigned long)p, aper_size); return 0; } - printk("Mapping aperture over %d KB of precious RAM @ %lx\n", + printk("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, __pa(p)); return (u32)__pa(p); } @@ -102,16 +87,10 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); return 0; } - /* Don't check the resource here because the aperture is usually - in an e820 reserved area, and we allocated these earlier. */ return 1; } -/* - * Find a PCI capability. - * This code runs before the PCI subsystem is initialized, so just - * access the northbridge directly. 
- */ +/* Find a PCI capability */ static __u32 __init find_cap(int num, int slot, int func, int cap) { u8 pos; @@ -276,6 +255,8 @@ void __init iommu_hole_init(void) fallback_aper_force) { printk("Your BIOS doesn't leave a aperture memory hole\n"); printk("Please enable the IOMMU option in the BIOS setup\n"); + printk("This costs you %d MB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); -- cgit v1.2.3 From 84929801e14d968caeb84795bfbb88f04283fbd9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 21 Jun 2005 17:14:32 -0700 Subject: [PATCH] x86_64: TASK_SIZE fixes for compatibility mode processes Appended patch will setup compatibility mode TASK_SIZE properly. This will fix atleast three known bugs that can be encountered while running compatibility mode apps. a) A malicious 32bit app can have an elf section at 0xffffe000. During exec of this app, we will have a memory leak as insert_vm_struct() is not checking for return value in syscall32_setup_pages() and thus not freeing the vma allocated for the vsyscall page. And instead of exec failing (as it has addresses > TASK_SIZE), we were allowing it to succeed previously. b) With a 32bit app, hugetlb_get_unmapped_area/arch_get_unmapped_area may return addresses beyond 32bits, ultimately causing corruption because of wrap-around and resulting in SEGFAULT, instead of returning ENOMEM. c) 32bit app doing this below mmap will now fail. mmap((void *)(0xFFFFE000UL), 0x10000UL, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_PRIVATE|MAP_ANON, 0, 0); Signed-off-by: Zou Nan hai Signed-off-by: Suresh Siddha Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/process.c | 4 ++-- arch/x86_64/kernel/ptrace.c | 17 ++++++++++------- arch/x86_64/kernel/sys_x86_64.c | 14 ++++---------- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 761b6d35e33..dce8bab4306 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -656,7 +656,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) switch (code) { case ARCH_SET_GS: - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); /* handle small bases via the GDT because that's faster to @@ -682,7 +682,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs */ - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE_OF(task)) return -EPERM; cpu = get_cpu(); /* handle small bases via the GDT because that's faster to diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 525f6a128a2..bbf64b59a21 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -257,12 +257,12 @@ static int putreg(struct task_struct *child, value &= 0xffff; return 0; case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE) + if (value >= TASK_SIZE_OF(child)) return -EIO; child->thread.fs = value; return 0; case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE) + if (value >= TASK_SIZE_OF(child)) return -EIO; child->thread.gs = value; return 0; @@ -279,7 +279,7 @@ static int putreg(struct task_struct *child, break; case offsetof(struct user_regs_struct, rip): /* Check if the new RIP address is canonical */ - if (value >= TASK_SIZE) + if (value >= TASK_SIZE_OF(child)) return -EIO; break; } 
@@ -419,6 +419,8 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + { + int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; ret = -EIO; if ((addr & 7) || addr > sizeof(struct user) - 7) @@ -430,22 +432,22 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; /* Disallows to set a breakpoint into the vsyscall */ case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg0 = data; ret = 0; break; case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg1 = data; ret = 0; break; case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg2 = data; ret = 0; break; case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE-7) break; + if (data >= TASK_SIZE_OF(child) - dsize) break; child->thread.debugreg3 = data; ret = 0; break; @@ -469,6 +471,7 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data break; } break; + } case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ case PTRACE_CONT: /* restart after signal. */ diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index dbebd5ccba6..d9798dd433f 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -68,13 +68,7 @@ out: static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - *begin = TASK_UNMAPPED_32; - *end = IA32_PAGE_OFFSET; - } else -#endif - if (flags & MAP_32BIT) { + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -84,10 +78,10 @@ static void find_start_end(unsigned long flags, unsigned long *begin, of playground for now. -AK */ *begin = 0x40000000; *end = 0x80000000; - } else { - *begin = TASK_UNMAPPED_64; + } else { + *begin = TASK_UNMAPPED_BASE; *end = TASK_SIZE; - } + } } unsigned long -- cgit v1.2.3 From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001 From: Wolfgang Wander Date: Tue, 21 Jun 2005 17:14:49 -0700 Subject: [PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. 
five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander Credit-to: "Richard Purdie" Signed-off-by: Ken Chen Acked-by: Ingo Molnar (partly) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/sys_x86_64.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index d9798dd433f..cc7821c6885 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -105,6 +105,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } + if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) + && len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = begin; + } addr = mm->free_area_cache; if (addr < begin) addr = begin; @@ -120,6 +125,7 @@ full_search: */ if (start_addr != begin) { start_addr = addr = begin; + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -131,6 +137,9 @@ full_search: mm->free_area_cache = addr + len; return addr; } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = vma->vm_end; } } -- cgit v1.2.3 From 073326634bec2f36e165583d21fb0a9fad47ac0a Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:03 -0700 Subject: [PATCH] remove direct ref to contig_page_data for x86-64 This patch pulls out all remaining direct references to contig_page_data from arch/x86-64, thus saving an ifdef in one case. 
Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/aperture.c | 4 ---- arch/x86_64/kernel/setup.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 504e6347499..c9a6b812e92 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -40,11 +40,7 @@ int fix_aperture __initdata = 1; static u32 __init allocate_aperture(void) { -#ifdef CONFIG_DISCONTIGMEM pg_data_t *nd0 = NODE_DATA(0); -#else - pg_data_t *nd0 = &contig_page_data; -#endif u32 aper_size; void *p; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 99f038ede23..125a0bb1f5d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -386,7 +386,7 @@ static void __init contig_initmem_init(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n",bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); reserve_bootmem(bootmap, bootmap_size); } #endif -- cgit v1.2.3 From 2b97690f4cd960779fb351b7cd9974390afabb36 Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:06 -0700 Subject: [PATCH] reorganize x86-64 NUMA and DISCONTIGMEM config options In order to use the alternative sparsemem implmentation for NUMA kernels, we need to reorganize the config options. This patch effectively abstracts out the CONFIG_DISCONTIGMEM options to CONFIG_NUMA in most cases. Thus, the discontigmem implementation may be employed as always, but the sparsemem implementation may be used alternatively. 
Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head64.c | 2 +- arch/x86_64/kernel/setup.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 0f8c78dcd38..cf6ab147a2a 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data) s = strstr(saved_command_line, "earlyprintk="); if (s != NULL) setup_early_printk(s); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA s = strstr(saved_command_line, "numa="); if (s != NULL) numa_setup(s+5); diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 125a0bb1f5d..dd8419b37a7 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -348,7 +348,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA if (!memcmp(from, "numa=", 5)) numa_setup(from+5); #endif @@ -377,7 +377,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) *cmdline_p = command_line; } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA static void __init contig_initmem_init(void) { unsigned long bootmap_size, bootmap; @@ -554,7 +554,7 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else contig_initmem_init(); -- cgit v1.2.3 From bbfceef47fb9467424113a004070bf37a806a97c Mon Sep 17 00:00:00 2001 From: Matt Tolentino Date: Thu, 23 Jun 2005 00:08:07 -0700 Subject: [PATCH] add x86-64 specific support for sparsemem This patch adds in the necessary support for sparsemem such that x86-64 kernels may use sparsemem as an alternative to discontigmem for NUMA kernels. Note that this does no preclude one from continuing to build NUMA kernels using discontigmem, but merely allows the option to build NUMA kernels with sparsemem. 
Interestingly, the use of sparsemem in lieu of discontigmem in NUMA kernels results in reduced text size for otherwise equivalent kernels as shown in the example builds below: text data bss dec hex filename 2371036 765884 1237108 4374028 42be0c vmlinux.discontig 2366549 776484 1302772 4445805 43d66d vmlinux.sparse Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/setup.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index dd8419b37a7..000015dd5a8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -40,6 +40,8 @@ #include #include #include +#include + #include #include #include @@ -378,16 +380,19 @@ static __init void parse_cmdline_early (char ** cmdline_p) } #ifndef CONFIG_NUMA -static void __init contig_initmem_init(void) +static void __init +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long bootmap_size, bootmap; - bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, end_pfn); - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size); + unsigned long bootmap_size, bootmap; + + memory_present(0, start_pfn, end_pfn); + bootmap_size = bootmem_bootmap_pages(end_pfn)<> PAGE_SHIFT, end_pfn); + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size); } #endif @@ -557,7 +562,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else - contig_initmem_init(); + contig_initmem_init(0, end_pfn); #endif /* Reserve direct mapping */ @@ -618,6 +623,8 @@ void __init setup_arch(char **cmdline_p) } } #endif + + sparse_init(); paging_init(); check_ioapic(); -- cgit v1.2.3 From 8c5a09082f4e61a176382e96a831a0636b918602 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 23 Jun 2005 00:08:18 -0700 Subject: [PATCH] x86/x86_64: pcibus_to_node Define pcibus_to_node to be able to figure out which NUMA node contains a given PCI device. This defines pcibus_to_node(bus) in include/linux/topology.h and adjusts the macros for i386 and x86_64 that already provided a way to determine the cpumask of a pci device. x86_64 was changed to not build an array of cpumasks anymore. Instead an array of nodes is build which can be used to generate the cpumask via node_to_cpumask. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61a63be6b29..ed6a5588146 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -45,7 +46,8 @@ int acpi_found_madt; int apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -cpumask_t pci_bus_to_cpumask [256] = { [0 ... 
255] = CPU_MASK_ALL }; +unsigned char pci_bus_to_node [256]; +EXPORT_SYMBOL(pci_bus_to_node); static int mp_current_pci_id = 0; /* I/O APIC entries */ -- cgit v1.2.3 From 799d19f6ec5ca2102c61122f5219a17f1c4e961a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 23 Jun 2005 00:08:24 -0700 Subject: [PATCH] allow early printk to use more than 25 lines Allow early printk code to take advantage of the full size of the screen, not just the first 25 lines. Signed-off-by: Jan Beulich Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/early_printk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index e3a19e8ebbf..9631c747c5e 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,20 +2,24 @@ #include #include #include +#include #include #include /* Simple VGA output */ #ifdef __i386__ +#include #define VGABASE (__ISA_IO_base + 0xb8000) #else +#include #define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif -#define MAX_YPOS 25 -#define MAX_XPOS 80 +#define MAX_YPOS max_ypos +#define MAX_XPOS max_xpos +static int max_ypos = 25, max_xpos = 80; static int current_ypos = 1, current_xpos = 0; static void early_vga_write(struct console *con, const char *str, unsigned n) @@ -196,7 +200,10 @@ int __init setup_early_printk(char *opt) } else if (!strncmp(buf, "ttyS", 4)) { early_serial_init(buf); early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { + } else if (!strncmp(buf, "vga", 3) + && SCREEN_INFO.orig_video_isVGA == 1) { + max_xpos = SCREEN_INFO.orig_video_cols; + max_ypos = SCREEN_INFO.orig_video_lines; early_console = &early_vga_console; } early_console_initialized = 1; -- cgit v1.2.3 From c0a88c987878e533fc21fbf684198021a3b2c279 Mon Sep 17 00:00:00 2001 From: Alexander Nyberg Date: Thu, 23 Jun 2005 00:08:35 -0700 Subject: [PATCH] x86_64: i8259.c iso99 structure initialization Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/i8259.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 7873d9ba881..19eafa0aa95 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -157,14 +157,13 @@ static unsigned int startup_8259A_irq(unsigned int irq) } static struct hw_interrupt_type i8259A_irq_type = { - "XT-PIC", - startup_8259A_irq, - shutdown_8259A_irq, - enable_8259A_irq, - disable_8259A_irq, - mask_and_ack_8259A, - end_8259A_irq, - NULL + .typename = "XT-PIC", + .startup = startup_8259A_irq, + .shutdown = shutdown_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, + .ack = mask_and_ack_8259A, + .end = end_8259A_irq, }; /* -- cgit v1.2.3 From a3a00751ad8970c13d0563c2e92ee68c655a8e6b Mon Sep 17 00:00:00 2001 From: john stultz Date: Thu, 23 Jun 2005 00:08:36 -0700 Subject: [PATCH] x86_64: fix hpet for systems that don't support legacy replacement Currently the x86-64 HPET code assumes the entire HPET implementation from the spec is present. This breaks on boxes that do not implement the optional legacy timer replacement functionality portion of the spec. This patch fixes this issue, allowing x86-64 systems that cannot use the HPET for the timer interrupt and RTC to still use the HPET as a time source. 
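Whether a given box falls into that class can be checked from user space by reading the HPET general capabilities register: bit 15 is the legacy replacement capability the old code assumed was always present. A rough diagnostic sketch follows; it assumes the conventional 0xfed00000 HPET base address and root access to /dev/mem, neither of which is guaranteed on every platform.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPET_BASE 0xfed00000UL

int main(void)
{
	int fd = open("/dev/mem", O_RDONLY);
	if (fd < 0) {
		perror("open /dev/mem");
		return 1;
	}

	volatile uint32_t *hpet = mmap(NULL, 4096, PROT_READ, MAP_SHARED,
				       fd, HPET_BASE);
	if (hpet == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	uint32_t id = hpet[0];		/* general capabilities/ID register */
	printf("HPET vendor %04x, %s legacy replacement\n",
	       id >> 16, (id & (1 << 15)) ? "supports" : "lacks");
	return 0;
}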
I've tested this patch on a system systems without HPET, with HPET but without legacy timer replacement, as well as HPET with legacy timer replacement. This version adds a minor check to cap the HPET counter value in gettimeoffset_hpet to avoid possible time inconsistencies. Please ignore the A2 version I sent to you earlier. Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/time.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index fb8c809b4cd..66bf6ddeb0c 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -64,6 +64,7 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ +static int hpet_use_timer; unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; @@ -105,7 +106,9 @@ static inline unsigned int do_gettimeoffset_tsc(void) static inline unsigned int do_gettimeoffset_hpet(void) { - return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; + /* cap counter read to one tick to avoid inconsistencies */ + unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; + return (min(counter,hpet_tick) * vxtime.quot) >> 32; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -301,7 +304,7 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; - this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); @@ -377,7 +380,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) { + if (vxtime.hpet_address) + offset = hpet_readl(HPET_COUNTER); + + if (hpet_use_timer) { + /* if we're using the hpet timer functionality, + * we can more accurately know the counter value + * when the timer interrupt occured. + */ offset = hpet_readl(HPET_T0_CMP) - hpet_tick; delay = hpet_readl(HPET_COUNTER) - offset; } else { @@ -803,17 +813,18 @@ static int hpet_timer_stop_set_go(unsigned long tick) * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, * and period also hpet_tick. */ - - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ - + hpet_writel(hpet_tick, HPET_T0_CMP); + hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + cfg |= HPET_CFG_LEGACY; + } /* * Go! 
*/ - cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; @@ -834,8 +845,7 @@ static int hpet_init(void) id = hpet_readl(HPET_ID); - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || - !(id & HPET_ID_LEGSUP)) + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) return -1; hpet_period = hpet_readl(HPET_PERIOD); @@ -845,6 +855,8 @@ static int hpet_init(void) hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / hpet_period; + hpet_use_timer = (id & HPET_ID_LEGSUP); + return hpet_timer_stop_set_go(hpet_tick); } @@ -901,9 +913,11 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) { + if (!hpet_init()) vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + + if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER @@ -968,7 +982,7 @@ void __init time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; if (vxtime.hpet_address && notsc) { - timetype = "HPET"; + timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; @@ -983,7 +997,7 @@ void __init time_init_gtod(void) printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); #endif } else { - timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; + timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } -- cgit v1.2.3 From 0928d6ef7f204979749fb241a90a04a35dae133a Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 23 Jun 2005 00:08:37 -0700 Subject: [PATCH] x86_64: never block forced SIGSEGV This is the x86_64 version of the signal fix I just posted for i386. This problem was first noticed on PPC and has already been fixed there. But the exact same issue applies to other platforms in the same way. The signal blocking for sa_mask and the handled signal takes place after the handler setup. When the stack is bogus, the handler setup forces a SIGSEGV. But then this will be blocked, and returning to user mode will fault again and iterate. This patch fixes the problem by checking whether signal handler setup failed, and not doing the signal-blocking if so. This copies what was done in the ppc code. I think all architectures' signal handler setup code follows this pattern and needs the change. 
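The failure mode can be triggered from user space with a deliberately bogus alternate signal stack. The sketch below is an illustration only (the unmapped ss_sp is intentional): frame setup for SIGUSR1 fails, the kernel forces SIGSEGV, and with the old ordering that SIGSEGV could be re-blocked by sa_mask instead of killing the task.

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void handler(int sig)
{
	_exit(0);		/* never reached: the frame cannot be built */
}

int main(void)
{
	stack_t ss = {
		.ss_sp = (void *)0x10,	/* intentionally unmapped */
		.ss_size = SIGSTKSZ,
	};
	struct sigaction sa = {
		.sa_handler = handler,
		.sa_flags = SA_ONSTACK,
	};

	sigfillset(&sa.sa_mask);	/* sa_mask includes SIGSEGV */
	sigaltstack(&ss, NULL);
	sigaction(SIGUSR1, &sa, NULL);

	raise(SIGUSR1);			/* signal frame setup fails here */
	printf("unexpectedly survived the forced SIGSEGV\n");
	return 1;
}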
Signed-off-by: Roland McGrath Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 429c0269dc4..3c5f3089371 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -34,9 +34,9 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs); -void ia32_setup_frame(int sig, struct k_sigaction *ka, +int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); asmlinkage long @@ -238,7 +238,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(rsp - size, 16); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; @@ -327,20 +327,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + int ret; + #ifdef DEBUG_SIG printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, @@ -384,20 +387,22 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { if (ka->sa.sa_flags & SA_SIGINFO) - ia32_setup_rt_frame(sig, ka, info, oldset, regs); + ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); else - ia32_setup_frame(sig, ka, oldset, regs); + ret = ia32_setup_frame(sig, ka, oldset, regs); } else #endif - setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } + + return ret; } /* @@ -437,8 +442,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; + return handle_signal(signr, &info, &ka, oldset, regs); } no_signal: -- cgit v1.2.3 From 701067c4661ebcdc155cc8f696acb24c016c058b Mon Sep 17 00:00:00 2001 From: Natalie Protasevich Date: Thu, 23 Jun 2005 00:08:41 -0700 Subject: [PATCH] x86_64: avoid wasting IRQs I suggest to change the way IRQs are handed out to PCI devices. Currently, each I/O APIC pin gets associated with an IRQ, no matter if the pin is used or not. It is expected that each pin can potentually be engaged by a device inserted into the corresponding PCI slot. However, this imposes severe limitation on systems that have designs that employ many I/O APICs, only utilizing couple lines of each, such as P64H2 chipset. 
It is used in ES7000, and currently, there is no way to boot the system with more that 9 I/O APICs. The simple change below allows to boot a system with say 64 (or more) I/O APICs, each providing 1 slot, which otherwise impossible because of the IRQ gaps created for unused lines on each I/O APIC. It does not resolve the problem with number of devices that exceeds number of possible IRQs, but eases up a tension for IRQs on any large system with potentually large number of devices. I only implemented this for the ACPI boot, since if the system is this big and using newer chipsets it is probably (better be!) an ACPI based system :). The change is completely "mechanical" and does not alter any internal structures or interrupt model/implementation. The patch works for both i386 and x86_64 archs. It works with MSIs just fine, and should not intervene with implementations like shared vectors, when they get worked out and incorporated. To illustrate, below is the interrupt distribution for 2-cell ES7000 with 20 I/O APICs, and an Ethernet card in the last slot, which should be eth1 and which was not configured because its IRQ exceeded allowable number (it actially turned out huge - 480!): zorro-tb2:~ # cat /proc/interrupts CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 0: 65716 30012 30007 30002 30009 30010 30010 30010 IO-APIC-edge timer 4: 373 0 725 280 0 0 0 0 IO-APIC-edge serial 8: 0 0 0 0 0 0 0 0 IO-APIC-edge rtc 9: 0 0 0 0 0 0 0 0 IO-APIC-level acpi 14: 39 3 0 0 0 0 0 0 IO-APIC-edge ide0 16: 108 13 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb1 18: 0 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb3 19: 15 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb2 23: 3 0 0 0 0 0 0 0 IO-APIC-level ehci_hcd:usb4 96: 4240 397 18 0 0 0 0 0 IO-APIC-level aic7xxx 97: 15 0 0 0 0 0 0 0 IO-APIC-level aic7xxx 192: 847 0 0 0 0 0 0 0 IO-APIC-level eth0 NMI: 0 0 0 0 0 0 0 0 LOC: 273423 274528 272829 274228 274092 273761 273827 273694 ERR: 7 MIS: 0 Even though the system doesn't have that many devices, some don't get enabled only because of IRQ numbering model. This is the IRQ picture after the patch was applied: zorro-tb2:~ # cat /proc/interrupts CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 0: 44169 10004 10004 10001 10004 10003 10004 6135 IO-APIC-edge timer 4: 345 0 0 0 0 244 0 0 IO-APIC-edge serial 8: 0 0 0 0 0 0 0 0 IO-APIC-edge rtc 9: 0 0 0 0 0 0 0 0 IO-APIC-level acpi 14: 39 0 3 0 0 0 0 0 IO-APIC-edge ide0 17: 4425 0 9 0 0 0 0 0 IO-APIC-level aic7xxx 18: 15 0 0 0 0 0 0 0 IO-APIC-level aic7xxx, uhci_hcd:usb3 21: 231 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb1 22: 26 0 0 0 0 0 0 0 IO-APIC-level uhci_hcd:usb2 23: 3 0 0 0 0 0 0 0 IO-APIC-level ehci_hcd:usb4 24: 348 0 0 0 0 0 0 0 IO-APIC-level eth0 25: 6 192 0 0 0 0 0 0 IO-APIC-level eth1 NMI: 0 0 0 0 0 0 0 0 LOC: 107981 107636 108899 108698 108489 108326 108331 108254 ERR: 7 MIS: 0 Not only we see the card in the last I/O APIC, but we are not even close to using up available IRQs, since we didn't waste any. 
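The allocation policy itself is simple and can be modelled outside the kernel. The toy below uses simplified numbers and is purely illustrative, not the kernel code: the legacy ISA range stays identity mapped and every other GSI that actually registers a device gets the next free IRQ, so unpopulated I/O APIC pins no longer consume IRQ numbers.

#include <stdio.h>

#define MAX_GSI_NUM 4096

static int gsi_to_irq[MAX_GSI_NUM];
static int next_pci_irq = 16;	/* first IRQ after the legacy ISA range */

static int register_gsi(int gsi)
{
	if (gsi < 16)		/* legacy ISA interrupts stay identity mapped */
		return gsi;
	if (!gsi_to_irq[gsi])	/* hand out a number only when a device shows up */
		gsi_to_irq[gsi] = next_pci_irq++;
	return gsi_to_irq[gsi];
}

int main(void)
{
	int used_gsis[] = { 4, 52, 99, 52 };	/* sparsely populated I/O APICs */
	unsigned i;

	for (i = 0; i < sizeof(used_gsis) / sizeof(used_gsis[0]); i++)
		printf("GSI %3d -> IRQ %d\n", used_gsis[i],
		       register_gsi(used_gsis[i]));
	return 0;
}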
Signed-off-by: Natalie Protasevich Acked-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mpparse.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index ed6a5588146..9c5aa2a790c 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -906,11 +906,20 @@ void __init mp_config_acpi_legacy_irqs (void) return; } +#define MAX_GSI_NUM 4096 + int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, to the IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -945,11 +954,21 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) if ((1< Date: Thu, 23 Jun 2005 00:08:46 -0700 Subject: [PATCH] xen: x86_64: Add macro for debugreg Add 2 macros to set and get debugreg on x86_64. This is useful for Xen because it will need only to redefine each macro to a hypervisor call. Signed-off-by: Vincent Hanquez Cc: Ian Pratt Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 3c5f3089371..0446e8b96a2 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -439,7 +439,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (current->thread.debugreg7) - asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ return handle_signal(signr, &info, &ka, oldset, regs); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 3dfec8fdabc..21706c1a0f9 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -669,7 +669,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) } #endif - asm("movq %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -721,7 +721,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_addr = (void __user *)regs->rip; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - asm volatile("movq %0,%%db7"::"r"(0UL)); + set_debugreg(0UL, 7); return; clear_TF_reenable: -- cgit v1.2.3 From 76381fee7e8feb4c22be636aa5d4765dbe4fbf9e Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Thu, 23 Jun 2005 00:08:46 -0700 Subject: [PATCH] xen: x86_64: use more usermode macro Make use of the user_mode macro where it's possible. This is useful for Xen because it will need only to redefine only the macro to a hypervisor call. 
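The helper being substituted is tiny; a self-contained sketch of its assumed shape (simplified from the x86-64 definition of the day, and exactly the macro a Xen port would redefine as a hypervisor call) looks like this:

#include <stdio.h>

struct pt_regs { unsigned long cs; };		/* only the field the test needs */

#define user_mode(regs) (!!((regs)->cs & 3))	/* CPL lives in the low two bits */

int main(void)
{
	struct pt_regs kernel_frame = { .cs = 0x10 };	/* ring 0 code selector */
	struct pt_regs user_frame   = { .cs = 0x33 };	/* ring 3 code selector */

	printf("kernel frame: user_mode=%d\n", user_mode(&kernel_frame));
	printf("user frame:   user_mode=%d\n", user_mode(&user_frame));
	return 0;
}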
Signed-off-by: Vincent Hanquez Cc: Ian Pratt Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 2 +- arch/x86_64/kernel/traps.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 0446e8b96a2..98b7ba95d58 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -422,7 +422,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. */ - if ((regs->cs & 3) != 3) + if (!user_mode(regs)) return 1; if (try_to_freeze(0)) diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 21706c1a0f9..121646fc43f 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -274,7 +274,7 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = (regs->cs & 3) == 0; + int in_kernel = !user_mode(regs); unsigned long rsp; const int cpu = safe_smp_processor_id(); struct task_struct *cur = cpu_pda[cpu].pcurrent; @@ -318,7 +318,7 @@ void handle_BUG(struct pt_regs *regs) struct bug_frame f; char tmp; - if (regs->cs & 3) + if (user_mode(regs)) return; if (__copy_from_user(&f, (struct bug_frame *) regs->rip, sizeof(struct bug_frame))) @@ -437,7 +437,7 @@ static void do_trap(int trapnr, int signr, char *str, } #endif - if ((regs->cs & 3) != 0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, signr)) @@ -522,7 +522,7 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) } #endif - if ((regs->cs & 3)!=0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, SIGSEGV)) @@ -638,7 +638,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) if (eregs == (struct pt_regs *)eregs->rsp) ; /* Exception from user space */ - else if (eregs->cs & 3) + else if (user_mode(eregs)) regs = ((struct pt_regs *)current->thread.rsp0) - 1; /* Exception from kernel and interrupts are enabled. Move to kernel process stack. */ @@ -697,7 +697,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) * allowing programs to debug themselves without the ptrace() * interface. */ - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; /* * Was the TF flag set by a debugger? If so, clear it now, @@ -715,7 +715,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_dr7; info.si_addr = (void __user *)regs->rip; @@ -756,7 +756,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) unsigned short cwd, swd; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel x87 math error")) return; @@ -822,7 +822,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) unsigned short mxcsr; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error")) return; -- cgit v1.2.3 From 73649dab0fd524cb8545a8cb83c6eaf77b107105 Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:23 -0700 Subject: [PATCH] x86_64 specific function return probes The following patch adds the x86_64 architecture specific implementation for function return probes. 
Function return probes is a mechanism built on top of kprobes that allows a caller to register a handler to be called when a given function exits. For example, to instrument the return path of sys_mkdir: static int sys_mkdir_exit(struct kretprobe_instance *i, struct pt_regs *regs) { printk("sys_mkdir exited\n"); return 0; } static struct kretprobe return_probe = { .handler = sys_mkdir_exit, }; return_probe.kp.addr = (kprobe_opcode_t *) kallsyms_lookup_name("sys_mkdir"); if (register_kretprobe(&return_probe)) { printk(KERN_DEBUG "Unable to register return probe!\n"); /* do error path */ } unregister_kretprobe(&return_probe); The way this works is that: * At system initialization time, kernel/kprobes.c installs a kprobe on a function called kretprobe_trampoline() that is implemented in the arch/x86_64/kernel/kprobes.c (More on this later) * When a return probe is registered using register_kretprobe(), kernel/kprobes.c will install a kprobe on the first instruction of the targeted function with the pre handler set to arch_prepare_kretprobe() which is implemented in arch/x86_64/kernel/kprobes.c. * arch_prepare_kretprobe() will prepare a kretprobe instance that stores: - nodes for hanging this instance in an empty or free list - a pointer to the return probe - the original return address - a pointer to the stack address With all this stowed away, arch_prepare_kretprobe() then sets the return address for the targeted function to a special trampoline function called kretprobe_trampoline() implemented in arch/x86_64/kernel/kprobes.c * The kprobe completes as normal, with control passing back to the target function that executes as normal, and eventually returns to our trampoline function. * Since a kprobe was installed on kretprobe_trampoline() during system initialization, control passes back to kprobes via the architecture specific function trampoline_probe_handler() which will lookup the instance in an hlist maintained by kernel/kprobes.c, and then call the handler function. * When trampoline_probe_handler() is done, the kprobes infrastructure single steps the original instruction (in this case just a top), and then calls trampoline_post_handler(). trampoline_post_handler() then looks up the instance again, puts the instance back on the free list, and then makes a long jump back to the original return instruction. So to recap, to instrument the exit path of a function this implementation will cause four interruptions: - A breakpoint at the very beginning of the function allowing us to switch out the return address - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) - A breakpoint in the trampoline function where our instrumented function returned to - A single step interruption to execute the original instruction that we replaced with the break instruction (normal kprobe flow) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 98 +++++++++++++++++++++++++++++++++++++++++++- arch/x86_64/kernel/process.c | 16 ++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index f77f8a0ff18..203672ca740 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -27,6 +27,8 @@ * adapted for x86_64 * 2005-Mar Roland McGrath * Fixed to handle %rip-relative addressing mode correctly. 
+ * 2005-May Rusty Lynch + * Added function return probes functionality */ #include @@ -240,6 +242,50 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } +struct task_struct *arch_get_kprobe_task(void *ptr) +{ + return ((struct thread_info *) (((unsigned long) ptr) & + (~(THREAD_SIZE -1))))->task; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)regs->rsp; + struct kretprobe_instance *ri; + static void *orig_ret_addr; + + /* + * Save the return address when the return probe hits + * the first time, and use it to populate the (krprobe + * instance)->ret_addr for subsequent return probes at + * the same addrress since stack address would have + * the kretprobe_trampoline by then. + */ + if (((void*) *sara) != kretprobe_trampoline) + orig_ret_addr = (void*) *sara; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->stack_addr = sara; + ri->ret_addr = orig_ret_addr; + add_rp_inst(ri); + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + } else { + rp->nmissed++; + } +} + +void arch_kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + while ((ri = get_rp_inst_tsk(tk)) != NULL) { + *((unsigned long *)(ri->stack_addr)) = + (unsigned long) ri->ret_addr; + recycle_rp_inst(ri); + } +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled thorough out this function. @@ -316,6 +362,55 @@ no_kprobe: return ret; } +/* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct task_struct *tsk; + struct kretprobe_instance *ri; + struct hlist_head *head; + struct hlist_node *node; + unsigned long *sara = (unsigned long *)regs->rsp - 1; + + tsk = arch_get_kprobe_task(sara); + head = kretprobe_inst_table_head(tsk); + + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara && ri->rp) { + if (ri->rp->handler) + ri->rp->handler(ri, regs); + } + } + return 0; +} + +void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kretprobe_instance *ri; + /* RA already popped */ + unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + + while ((ri = get_rp_inst(sara))) { + regs->rip = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + } + regs->eflags &= ~TF_MASK; +} + /* * Called after single-stepping. 
p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" @@ -404,7 +499,8 @@ int post_kprobe_handler(struct pt_regs *regs) if (current_kprobe->post_handler) current_kprobe->post_handler(current_kprobe, regs, 0); - resume_execution(current_kprobe, regs); + if (current_kprobe->post_handler != trampoline_post_handler) + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index dce8bab4306..e59d1f9d616 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -293,6 +294,14 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(me); + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); @@ -312,6 +321,13 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); -- cgit v1.2.3 From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001 From: Rusty Lynch Date: Thu, 23 Jun 2005 00:09:25 -0700 Subject: [PATCH] Move kprobe [dis]arming into arch specific code The architecture independent code of the current kprobes implementation is arming and disarming kprobes at registration time. The problem is that the code is assuming that arming and disarming is a just done by a simple write of some magic value to an address. This is problematic for ia64 where our instructions look more like structures, and we can not insert break points by just doing something like: *p->addr = BREAKPOINT_INSTRUCTION; The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent functions: * void arch_arm_kprobe(struct kprobe *p) * void arch_disarm_kprobe(struct kprobe *p) and then adds the new functions for each of the architectures that already implement kprobes (spar64/ppc64/i386/x86_64). I thought arch_[dis]arm_kprobe was the most descriptive of what was really happening, but each of the architectures already had a disarm_kprobe() function that was really a "disarm and do some other clean-up items as needed when you stumble across a recursive kprobe." So... I took the liberty of changing the code that was calling disarm_kprobe() to call arch_disarm_kprobe(), and then do the cleanup in the block of code dealing with the recursive kprobe case. So far this patch as been tested on i386, x86_64, and ppc64, but still needs to be tested in sparc64. 
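On x86-64 the two new entry points stay trivial, which the stand-alone toy below illustrates; it is illustrative only (for brevity it saves the original byte at arm time, whereas the kernel saves it in arch_copy_kprobe()). An architecture like ia64 keeps the same entry points but does bundle surgery behind them instead of a single byte write.

#include <stdint.h>
#include <stdio.h>

#define BREAKPOINT_INSTRUCTION 0xcc	/* int3 on x86 */

struct kprobe {
	uint8_t *addr;			/* where the probe is planted */
	uint8_t opcode;			/* original byte to restore */
};

static void arch_arm_kprobe(struct kprobe *p)
{
	p->opcode = *p->addr;
	*p->addr = BREAKPOINT_INSTRUCTION;
}

static void arch_disarm_kprobe(struct kprobe *p)
{
	*p->addr = p->opcode;
}

int main(void)
{
	uint8_t text[] = { 0x55, 0x48, 0x89, 0xe5 };	/* push %rbp; mov %rsp,%rbp */
	struct kprobe kp = { .addr = text };

	arch_arm_kprobe(&kp);
	printf("armed:    first byte 0x%02x\n", text[0]);
	arch_disarm_kprobe(&kp);
	printf("disarmed: first byte 0x%02x\n", text[0]);
	return 0;
}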
Signed-off-by: Rusty Lynch Signed-off-by: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 203672ca740..324bf57925a 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -39,7 +39,7 @@ #include #include #include - +#include #include #include @@ -216,19 +216,28 @@ void arch_copy_kprobe(struct kprobe *p) BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ *ripdisp = disp; } + p->opcode = *p->addr; } -void arch_remove_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { - up(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); - down(&kprobe_mutex); + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +void arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; - regs->rip = (unsigned long)p->addr; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ + up(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + down(&kprobe_mutex); } static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -311,7 +320,8 @@ int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - disarm_kprobe(p, regs); + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; ret = 1; } else { p = current_kprobe; -- cgit v1.2.3 From aa3d7e3d782bbcf567b1da6101d8fbff7cf7e7a6 Mon Sep 17 00:00:00 2001 From: Prasanna S Panchamukhi Date: Thu, 23 Jun 2005 00:09:37 -0700 Subject: [PATCH] kprobes: Temporary disarming of reentrant probe for x86_64 This patch includes x86_64 architecture specific changes to support temporary disarming on reentrancy of probes. 
Signed-of-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/kprobes.c | 76 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index 324bf57925a..4e680f87a75 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -45,12 +45,10 @@ static DECLARE_MUTEX(kprobe_mutex); -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; static kprobe_opcode_t *get_insn_slot(void); @@ -240,6 +238,31 @@ void arch_remove_kprobe(struct kprobe *p) down(&kprobe_mutex); } +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_old_rflags_prev = kprobe_old_rflags; + kprobe_saved_rflags_prev = kprobe_saved_rflags; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_old_rflags = kprobe_old_rflags_prev; + kprobe_saved_rflags = kprobe_saved_rflags_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + current_kprobe = p; + kprobe_saved_rflags = kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kprobe_saved_rflags &= ~IF_MASK; +} + static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { regs->eflags |= TF_MASK; @@ -319,10 +342,30 @@ int kprobe_handler(struct pt_regs *regs) regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); goto no_kprobe; + } else if (kprobe_status == KPROBE_HIT_SSDONE) { + /* TODO: Provide re-entrancy from + * post_kprobes_handler() and avoid exception + * stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; + ret = 1; + } else { + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the + * handler. We here save the original kprobe + * variables and just single step on instruction + * of the new probe without calling any user + * handlers. 
+ */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - ret = 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -352,11 +395,7 @@ int kprobe_handler(struct pt_regs *regs) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; - kprobe_saved_rflags = kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kprobe_saved_rflags &= ~IF_MASK; + set_current_kprobe(p, regs); if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -506,14 +545,23 @@ int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } if (current_kprobe->post_handler != trampoline_post_handler) resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; - unlock_kprobes(); + /* Restore the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } else { + unlock_kprobes(); + } +out: preempt_enable_no_resched(); /* -- cgit v1.2.3 From e6982c671c560da4a0bc5c908cbcbec12bd5991d Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:54:58 -0700 Subject: [PATCH] x86_64: Change init sections for CPU hotplug support This patch adds __cpuinit and __cpuinitdata sections that need to exist past boot to support cpu hotplug. Caveat: This is done *only* for EM64T CPU Hotplug support, on request from Andi Kleen. Much of the generic hotplug code in the kernel, and the other archs that support CPU hotplug today (i386, ia64, ppc64, s390 and parisc), do not mark sections with __cpuinit; they only mark them as __devinit and __devinitdata. If someone is motivated to change the generic code, we need to make sure all existing hotplug code does not break on the other archs that don't use __cpuinit and __cpudevinit.
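For illustration only (not part of the patch itself; the function and variable below are hypothetical), this is the usage pattern the new annotations are meant for:

#include <linux/init.h>
#include <linux/cpumask.h>

/*
 * Hypothetical example: __init code is discarded after boot, so anything
 * that must run again when a CPU is hot-added has to live in __cpuinit
 * (and its data in __cpuinitdata), which is only discarded when CPU
 * hotplug is not configured.
 */
static cpumask_t example_cpus_seen __cpuinitdata = CPU_MASK_NONE;

static int __cpuinit example_cpu_prepare(unsigned int cpu)
{
	/* runs for the boot CPU and again for every hot-plugged CPU */
	cpu_set(cpu, example_cpus_seen);
	return 0;
}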
Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 8 ++++---- arch/x86_64/kernel/i387.c | 2 +- arch/x86_64/kernel/mce.c | 8 ++++---- arch/x86_64/kernel/mce_intel.c | 4 ++-- arch/x86_64/kernel/nmi.c | 4 ++-- arch/x86_64/kernel/process.c | 2 +- arch/x86_64/kernel/setup.c | 18 +++++++++--------- arch/x86_64/kernel/setup64.c | 6 +++--- arch/x86_64/kernel/smpboot.c | 15 +++++---------- 9 files changed, 31 insertions(+), 36 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index f8e6cc4fecd..2a6c893ccf6 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -285,7 +285,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __init setup_local_APIC (void) +void __cpuinit setup_local_APIC (void) { unsigned int value, ver, maxlvt; @@ -534,7 +534,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __init apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -774,14 +774,14 @@ void __init setup_boot_APIC_clock (void) local_irq_enable(); } -void __init setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ setup_APIC_timer(calibration_result); local_irq_enable(); } -void __init disable_APIC_timer(void) +void __cpuinit disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index ba139cac57c..d9b22b633e3 100644 --- a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c @@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ -void __init fpu_init(void) +void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); extern void __bad_fxsave_alignment(void); diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 3a89d735a4f..7ab15c8ab95 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -327,7 +327,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { @@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) } } -static void __init mce_cpu_features(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c) * Called for each booted CPU to set up machine checks. * Must be called with preempt off. 
*/ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; @@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) -static __init int mce_init_device(void) +static __cpuinit int mce_init_device(void) { int err; if (!mce_available(&boot_cpu_data)) diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 4db9a640069..0be0a795981 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -42,7 +42,7 @@ done: irq_exit(); } -static void __init intel_init_thermal(struct cpuinfo_x86 *c) +static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) { u32 l, h; int tm2 = 0; @@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) return; } -void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 31c0f2e6ac9..4e44d6e6b7e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -static __init inline int nmi_known_cpu(void) +static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __init nmi_watchdog_default(void) +void __cpuinit nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index e59d1f9d616..15fa35a81a7 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -204,7 +204,7 @@ static void mwait_idle(void) } } -void __init select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { static int printed; if (cpu_has(c, X86_FEATURE_MWAIT)) { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 000015dd5a8..ca9a915a6d4 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -676,7 +676,7 @@ void __init setup_arch(char **cmdline_p) #endif } -static int __init get_model_name(struct cpuinfo_x86 *c) +static int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -692,7 +692,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c) } -static void __init display_cacheinfo(struct cpuinfo_x86 *c) +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; @@ -803,7 +803,7 @@ static int __init init_amd(struct cpuinfo_x86 *c) return r; } -static void __init detect_ht(struct cpuinfo_x86 *c) +static void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; @@ -864,7 +864,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax; @@ -882,7 +882,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __init init_intel(struct cpuinfo_x86 *c) +static void 
__cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; @@ -902,7 +902,7 @@ static void __init init_intel(struct cpuinfo_x86 *c) c->x86_num_cores = intel_num_cpu_cores(c); } -void __init get_cpu_vendor(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -923,7 +923,7 @@ struct cpu_model_info { /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -void __init early_identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { u32 tfms; @@ -977,7 +977,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; u32 xlvl; @@ -1054,7 +1054,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c) } -void __init print_cpu_info(struct cpuinfo_x86 *c) +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) printk("%s", c->x86_model_id); diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 678b7ac33b8..f2b9c6bc999 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -29,7 +29,7 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; @@ -171,7 +171,7 @@ void syscall_init(void) wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); } -void __init check_efer(void) +void __cpuinit check_efer(void) { unsigned long efer; @@ -188,7 +188,7 @@ void __init check_efer(void) * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __init cpu_init (void) +void __cpuinit cpu_init (void) { #ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f1ec0f34594..bc98a6722cb 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -58,11 +58,6 @@ #include #include -/* Change for real CPU hotplug. Note other files need to be fixed - first too. */ -#define __cpuinit __init -#define __cpuinitdata __initdata - /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ @@ -823,7 +818,7 @@ static __cpuinit void smp_cleanup_boot(void) * * RED-PEN audit/test this more. I bet there is more state messed up here. */ -static __cpuinit void disable_smp(void) +static __init void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); @@ -838,7 +833,7 @@ static __cpuinit void disable_smp(void) /* * Handle user cpus=... parameter. */ -static __cpuinit void enforce_max_cpus(unsigned max_cpus) +static __init void enforce_max_cpus(unsigned max_cpus) { int i, k; k = 0; @@ -855,7 +850,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus) /* * Various sanity checks. */ -static int __cpuinit smp_sanity_check(unsigned max_cpus) +static int __init smp_sanity_check(unsigned max_cpus) { if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", @@ -913,7 +908,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus) * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. 
Just do some sanity checking here and enable APIC mode. */ -void __cpuinit smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus(unsigned int max_cpus) { int i; @@ -1019,7 +1014,7 @@ int __cpuinit __cpu_up(unsigned int cpu) /* * Finish the SMP boot. */ -void __cpuinit smp_cpus_done(unsigned int max_cpus) +void __init smp_cpus_done(unsigned int max_cpus) { zap_low_mappings(); smp_cleanup_boot(); -- cgit v1.2.3 From 76e4f660d9f4c6d1bb473f72be2988c35eaca948 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:55:00 -0700 Subject: [PATCH] x86_64: CPU hotplug support Experimental CPU hotplug patch for x86_64 ----------------------------------------- This supports logical CPU online and offline. - Test with maxcpus=1, and then kick other cpu's off to test if init code is all cleaned up. CONFIG_SCHED_SMT works as well. - idle threads are forked on demand from keventd threads for clean startup TBD: 1. Not tested on a real NUMA machine (tested with numa=fake=2) 2. Handle ACPI pieces for physical hotplug support. Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Shaohua.li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/irq.c | 29 ++++++ arch/x86_64/kernel/process.c | 29 +++++- arch/x86_64/kernel/smpboot.c | 205 +++++++++++++++++++++++++++++++++++++++---- arch/x86_64/kernel/traps.c | 8 +- 4 files changed, 254 insertions(+), 17 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 62b112e4deb..cc3fb85f514 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 15fa35a81a7..1d91271796e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -8,7 +8,8 @@ * * X86-64 port * Andi Kleen. - * + * + * CPU hotplug support - ashok.raj@intel.com * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ @@ -18,6 +19,7 @@ #include +#include #include #include #include @@ -154,6 +156,29 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); +#ifdef CONFIG_HOTPLUG_CPU +DECLARE_PER_CPU(int, cpu_state); + +#include +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + idle_task_exit(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + while (1) + safe_halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. 
There's no useful work to be * done, so just try to conserve power and have a @@ -174,6 +199,8 @@ void cpu_idle (void) idle = pm_idle; if (!idle) idle = default_idle; + if (cpu_is_offline(smp_processor_id())) + play_dead(); idle(); } diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index bc98a6722cb..5a3f955b657 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -34,6 +34,7 @@ * Andi Kleen : Converted to new state machine. * Various cleanups. * Probably mostly hotplug CPU ready now. + * Ashok Raj : CPU hotplug support */ @@ -98,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map); extern unsigned char trampoline_data[]; extern unsigned char trampoline_end[]; +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* + * Store all idle threads, this can be reused instead of creating + * a new thread. Also avoids complicated thread destroy functionality + * for idle threads. + */ +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; + +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) + +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +#ifdef CONFIG_HOTPLUG_CPU +#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) +#else +#define fixup_cpu_possible_map(x) +#endif + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -623,33 +655,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta return (send_status | accept_status); } +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + /* * Boot one CPU. */ static int __cpuinit do_boot_cpu(int cpu, int apicid) { - struct task_struct *idle; unsigned long boot_error; int timeout; unsigned long start_rip; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + + c_idle.idle = get_idle_for_cpu(cpu); + + if (c_idle.idle) { + c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. + * During cold boot process, keventd thread is not spun up yet. + * When we do cpu hot-add, we create idle threads on the fly, we should + * not acquire any attributes from the calling context. Hence the clean + * way to create kernel_threads() is to do that from keventd(). + * We do the current_is_keventd() due to the fact that ACPI notifier + * was also queuing to keventd() and when the caller is already running + * in context of keventd(), we would end up with locking up the keventd + * thread. 
*/ - idle = fork_idle(cpu); - if (IS_ERR(idle)) { + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(idle); + return PTR_ERR(c_idle.idle); } - cpu_pda[cpu].pcurrent = idle; + set_idle_for_cpu(cpu, c_idle.idle); + +do_rest: + + cpu_pda[cpu].pcurrent = c_idle.idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, start_rip, init_rsp); @@ -925,10 +1001,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) int apicid = cpu_present_to_apicid(i); if (physid_isset(apicid, phys_cpu_present_map)) { cpu_set(i, cpu_present_map); - /* possible map would be different if we supported real - CPU hotplug. */ cpu_set(i, cpu_possible_map); } + fixup_cpu_possible_map(i); } if (smp_sanity_check(max_cpus) < 0) { @@ -977,9 +1052,6 @@ void __init smp_prepare_boot_cpu(void) /* * Entry point to boot a CPU. - * - * This is all __cpuinit, not __devinit for now because we don't support - * CPU hotplug (yet). */ int __cpuinit __cpu_up(unsigned int cpu) { @@ -996,6 +1068,14 @@ int __cpuinit __cpu_up(unsigned int cpu) return -EINVAL; } + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1008,7 +1088,9 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - return 0; + err = 0; + + return err; } /* @@ -1016,7 +1098,9 @@ int __cpuinit __cpu_up(unsigned int cpu) */ void __init smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_HOTPLUG_CPU zap_low_mappings(); +#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC @@ -1028,3 +1112,94 @@ void __init smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } + +#ifdef CONFIG_HOTPLUG_CPU + +static void +remove_siblinginfo(int cpu) +{ + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +void remove_cpu_from_maps(void) +{ + int cpu = smp_processor_id(); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ +} + +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + disable_APIC_timer(); + + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. 
+ */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + remove_cpu_from_maps(); + fixup_irqs(cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) + return; + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 121646fc43f..10273663000 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) asmlinkage void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); /* Only the BSP gets external NMIs from the system. */ - if (!smp_processor_id()) + if (!cpu) reason = get_nmi_reason(); + if (!cpu_online(cpu)) + return; + if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) -- cgit v1.2.3 From cb0cd8d49a9b81aff7a02e2ed826b5cfdfe9a172 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:55:01 -0700 Subject: [PATCH] x86_64: CPU hotplug sibling map cleanup This patch is a minor cleanup to the cpu sibling/core map. It is required that this setup happens on a per-cpu bringup time. Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/smpboot.c | 84 +++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 48 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 5a3f955b657..571a55462fa 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -445,6 +445,33 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static inline void set_cpu_sibling_map(int cpu) +{ + int i; + + if (smp_num_siblings > 1) { + for_each_cpu(i) { + if (cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_num_cores > 1) { + for_each_cpu(i) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } +} + /* * Setup code on secondary processor (after comming out of the trampoline) */ @@ -474,6 +501,12 @@ void __cpuinit start_secondary(void) enable_APIC_timer(); + /* + * The sibling maps must be set before turing the online map on for + * this cpu + */ + set_cpu_sibling_map(smp_processor_id()); + /* * Allow the master to continue. */ @@ -816,51 +849,6 @@ do_rest: cycles_t cacheflush_time; unsigned long cache_decay_ticks; -/* - * Construct cpu_sibling_map[], so that we can tell the sibling CPU - * on SMT systems efficiently. 
- */ -static __cpuinit void detect_siblings(void) -{ - int cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - for_each_online_cpu (cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (smp_num_siblings > 1) { - for_each_online_cpu (i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - if (c->x86_num_cores > 1) { - for_each_online_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) - cpu_set(i, cpu_core_map[cpu]); - } - } else - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } -} - /* * Cleanup possible dangling ends... */ @@ -1048,6 +1036,8 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); } /* @@ -1107,7 +1097,6 @@ void __init smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif - detect_siblings(); time_init_gtod(); check_nmi_watchdog(); @@ -1115,8 +1104,7 @@ void __init smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_HOTPLUG_CPU -static void -remove_siblinginfo(int cpu) +static void remove_siblinginfo(int cpu) { int sibling; -- cgit v1.2.3 From 884d9e40b4089014f40c49e86ac6505842db2b53 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:55:02 -0700 Subject: [PATCH] x86_64: Dont use broadcast shortcut to make it cpu hotplug safe. Broadcast IPI's provide un-expected behaviour for cpu hotplug. CPU's in offline state also end up receiving the IPI. Once the cpus become online they receive these stale IPI's which are bad and introduce unexpected behaviour. This is easily avoided by not sending a broadcast and addressing just the CPU's in online map. Doing prelim cycle counts it appears there is no big overhead and numbers seem around 0x3000-0x3900 on an average on x86 and x86_64 systems with CPUS running 3G, both for broadcast and mask version of the API's. The shortcuts are useful only for flat mode (where the perf shows no degradation), and in cluster mode, its unicast anyway. Its simpler to just not use broadcast anymore. Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/genapic_flat.c | 42 ++++++++++++++++++++++++--------------- arch/x86_64/kernel/smp.c | 10 ++++++++++ arch/x86_64/kernel/smpboot.c | 19 +++++++++++++++++- 3 files changed, 54 insertions(+), 17 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index b4cbbad0422..00f3fa6df71 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -7,6 +7,8 @@ * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. 
+ * Ashok Raj + * Removed IPI broadcast shortcut to support CPU hotplug */ #include #include @@ -45,22 +47,6 @@ static void flat_init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static void flat_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case. - */ - if (num_online_cpus() > 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static void flat_send_IPI_all(int vector) -{ - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; @@ -93,6 +79,30 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static void flat_send_IPI_allbutself(int vector) +{ + cpumask_t mask; + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. + * thus we have to avoid sending IPIs in this case. + */ + int this_cpu = get_cpu(); + + mask = cpu_online_map; + cpu_clear(this_cpu, mask); + + if (cpus_weight(mask) >= 1) + flat_send_IPI_mask(mask, vector); + + put_cpu(); +} + +static void flat_send_IPI_all(int vector) +{ + flat_send_IPI_mask(cpu_online_map, vector); +} + static int flat_apic_id_registered(void) { return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 1e379ed17b1..ccae392886a 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -283,6 +283,16 @@ struct call_data_struct { static struct call_data_struct * call_data; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + /* * this function sends a 'generic call function' IPI to all other CPUs * in the system. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 571a55462fa..b969ee12872 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -507,10 +507,23 @@ void __cpuinit start_secondary(void) */ set_cpu_sibling_map(smp_processor_id()); + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI receipients, and the time when the determination is made + * for which cpus receive the IPI in genapic_flat.c. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); + /* * Allow the master to continue. */ cpu_set(smp_processor_id(), cpu_online_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unlock_ipi_call_lock(); + mb(); /* Wait for TSC sync to not schedule things before. @@ -1038,6 +1051,7 @@ void __init smp_prepare_boot_cpu(void) cpu_set(me, cpu_callout_map); cpu_set(0, cpu_sibling_map[0]); cpu_set(0, cpu_core_map[0]); + per_cpu(cpu_state, me) = CPU_ONLINE; } /* @@ -1066,6 +1080,7 @@ int __cpuinit __cpu_up(unsigned int cpu) return -ENOSYS; } + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! 
*/ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1170,8 +1185,10 @@ void __cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); return; + } current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } -- cgit v1.2.3 From a02c4cb67e4ccd5ce7a13c7f04c2fedb06c35431 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Sat, 25 Jun 2005 14:55:03 -0700 Subject: [PATCH] x86_64: Provide ability to choose using shortcuts for IPI in flat mode. This patch provides an option to switch broadcast or use mask version for sending IPI's. If CONFIG_HOTPLUG_CPU is defined, we choose not to use broadcast shortcuts by default, otherwise we choose broadcast mode as default. both cases, one can change this via startup cmd line option, to choose no-broadcast mode. no_ipi_broadcast=1 This is provided on request from Andi Kleen, since he doesnt agree with replacing IPI shortcuts as a solution for CPU hotplug. Without removing broadcast IPI's, it would mean lots of new code for __cpu_up() path, which would acheive the same results. Signed-off-by: Ashok Raj Acked-by: Andi Kleen Acked-by: Zwane Mwaikambo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/genapic_flat.c | 96 +++++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 14 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 00f3fa6df71..28284696508 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -20,6 +20,46 @@ #include #include +/* + * The following permit choosing broadcast IPI shortcut v.s sending IPI only + * to online cpus via the send_IPI_mask varient. + * The mask version is my preferred option, since it eliminates a lot of + * other extra code that would need to be written to cleanup intrs sent + * to a CPU while offline. + * + * Sending broadcast introduces lots of trouble in CPU hotplug situations. + * These IPI's are delivered to cpu's irrespective of their offline status + * and could pickup stale intr data when these CPUS are turned online. + * + * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with + * the idea of not using broadcast IPI's anymore. Hence the run time check + * is introduced, on his request so we can choose an alternate mechanism. + * + * Initial wacky performance tests that collect cycle counts show + * no increase in using mask v.s broadcast version. In fact they seem + * identical in terms of cycle counts. + * + * if we need to use broadcast, we need to do the following. + * + * cli; + * hold call_lock; + * clear any pending IPI, just ack and clear all pending intr + * set cpu_online_map; + * release call_lock; + * sti; + * + * The complicated dummy irq processing shown above is not required if + * we didnt sent IPI's to wrong CPU's in the first place. 
+ * + * - Ashok Raj + */ +#ifdef CONFIG_HOTPLUG_CPU +#define DEFAULT_SEND_IPI (1) +#else +#define DEFAULT_SEND_IPI (0) +#endif + +static int no_broadcast=DEFAULT_SEND_IPI; static cpumask_t flat_target_cpus(void) { @@ -79,28 +119,37 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } -static void flat_send_IPI_allbutself(int vector) +static inline void __local_flat_send_IPI_allbutself(int vector) { - cpumask_t mask; - /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case. - */ - int this_cpu = get_cpu(); - - mask = cpu_online_map; - cpu_clear(this_cpu, mask); + if (no_broadcast) { + cpumask_t mask = cpu_online_map; + int this_cpu = get_cpu(); - if (cpus_weight(mask) >= 1) + cpu_clear(this_cpu, mask); flat_send_IPI_mask(mask, vector); + put_cpu(); + } + else + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} - put_cpu(); +static inline void __local_flat_send_IPI_all(int vector) +{ + if (no_broadcast) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_allbutself(int vector) +{ + if (((num_online_cpus()) - 1) >= 1) + __local_flat_send_IPI_allbutself(vector); } static void flat_send_IPI_all(int vector) { - flat_send_IPI_mask(cpu_online_map, vector); + __local_flat_send_IPI_all(vector); } static int flat_apic_id_registered(void) @@ -121,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb) return ((ebx >> 24) & 0xFF) >> index_msb; } +static __init int no_ipi_broadcast(char *str) +{ + get_option(&str, &no_broadcast); + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : + "IPI Broadcast"); + return 1; +} + +__setup("no_ipi_broadcast", no_ipi_broadcast); + struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, @@ -135,3 +194,12 @@ struct genapic apic_flat = { .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, }; + +static int __init print_ipi_mode(void) +{ + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : + "Shortcut"); + return 0; +} + +late_initcall(print_ipi_mode); -- cgit v1.2.3 From 343c3f642898cb2cb5c3e4f948e3e0a1bbc0351b Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 25 Jun 2005 14:55:09 -0700 Subject: [PATCH] s-t-RAM: load gdt the right way Sleep code uses wrong version of lgdt, that does the wrong thing when gdt is beyond 16MB or so. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/acpi/wakeup.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index a4c630034cd..185faa911db 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -67,7 +67,7 @@ wakeup_code: shll $4, %eax addl $(gdta - wakeup_code), %eax movl %eax, gdt_48a +2 - wakeup_code - lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is + lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate movl $1, %eax # protected mode (PE) bit -- cgit v1.2.3 From 8d783b3e02002bce8cf9d4e4a82922ee7e59b1e5 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 25 Jun 2005 14:55:14 -0700 Subject: [PATCH] swsusp: clean assembly parts This patch fixes register saving so that each register is only saved once, and adds missing saving of %cr8 on x86-64. 
Some reordering so that save/restore is more logical/safer (segment registers should be restored after gdt). Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/suspend.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebaa1e37d65..6c0f402e3a8 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ @@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); + asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); } void save_processor_state(void) @@ -90,11 +90,19 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); + /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + /* * segment registers */ @@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - fix_processor_context(); do_fpu_end(); -- cgit v1.2.3 From b2b18660066997420b716c1881a6be8b82700d97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Jun 2005 14:55:38 -0700 Subject: [PATCH] RCU: clean up a few remaining synchronize_kernel() calls 2.6.12-rc6-mm1 has a few remaining synchronize_kernel()s, some (but not all) in comments. This patch changes these synchronize_kernel() calls (and comments) to synchronize_rcu() or synchronize_sched() as follows: - arch/x86_64/kernel/mce.c mce_read(): change to synchronize_sched() to handle races with machine-check exceptions (synchronize_rcu() would not cut it given RCU implementations intended for hardcore realtime use. - drivers/input/serio/i8042.c i8042_stop(): change to synchronize_sched() to handle races with i8042_interrupt() interrupt handler. Again, synchronize_rcu() would not cut it given RCU implementations intended for hardcore realtime use. - include/*/kdebug.h comments: change to synchronize_sched() to handle races with NMIs. As before, synchronize_rcu() would not cut it... - include/linux/list.h comment: change to synchronize_rcu(), since this comment is for list_del_rcu(). - security/keys/key.c unregister_key_type(): change to synchronize_rcu(), since this is interacting with RCU read side. 
- security/keys/process_keys.c install_session_keyring(): change to synchronize_rcu(), since this is interacting with RCU read side. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 7ab15c8ab95..21e70625a49 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - synchronize_kernel(); + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. */ -- cgit v1.2.3 From 70adada4282b52130ef31c7722d3fd8d94dacbc2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:42 -0700 Subject: [PATCH] kexec: x86_64: e820 64bit fix From: Eric W. Biederman It is ok to reserve resources > 4G on x86_64; struct resource is 64bit now :) Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 7c154dfff64..cf4878309b0 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -191,8 +191,6 @@ void __init e820_reserve_resources(void) int i; for (i = 0; i < e820.nr_map; i++) { struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; res = alloc_bootmem_low(sizeof(struct resource)); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; -- cgit v1.2.3 From 719e711050482be667dafd39ec787859d353931c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:43 -0700 Subject: [PATCH] kexec: x86_64: add i8259 shutdown method From: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/i8259.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 19eafa0aa95..a8916909512 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -414,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { -- cgit v1.2.3 From 208fb93162d51faa69b9774fa7809858d84fd9dc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:45 -0700 Subject: [PATCH] kexec: x86_64: restore apic virtual wire mode on shutdown When coming out of apic mode, attempt to set the appropriate apic back into virtual wire mode. This improves on previous versions of this patch by never setting both the local apic and the ioapic into virtual wire mode. This code looks at data from the mptable to see if an ioapic has an ExtInt input to make this decision.
A future improvement is to figure out which apic or ioapic was in virtual wire mode at boot time and to remember it. That is potentially a more accurate method, of selecting which apic to place in virutal wire mode. Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/apic.c | 38 +++++++++++++++++++++++++++++++++++++- arch/x86_64/kernel/io_apic.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 2a6c893ccf6..375d369570c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void) } } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afd87e64d0a..157190d986b 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type) /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int __init find_isa_irq_pin(int irq, int type) +static int find_isa_irq_pin(int irq, int type) { int i; @@ -1132,12 +1132,44 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. 
+ */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + disconnect_bsp_APIC(pin != -1); } /* -- cgit v1.2.3 From 5ded01e83ec3b60191b03b9f88f53acd4e6112f5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:48 -0700 Subject: [PATCH] kexec: x86_64: vmlinux: fix physical addresses The vmlinux on x86_64 does not report the correct physical address of the kernel. Instead in the physical address field it currently reports the virtual address of the kernel. This is patch is a bug fix that corrects vmlinux to report the proper physical addresses. This is potentially a help for crash dump analysis tools. This definitiely allows bootloaders that load vmlinux as a standard ELF executable. Bootloaders directly loading vmlinux become of practical importance when we consider the kexec on panic case. Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/vmlinux.lds.S | 128 ++++++++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 42 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 59ebd5beda8..73389f51c4e 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -2,7 +2,10 @@ * Written by Martin Mares ; */ +#define LOAD_OFFSET __START_KERNEL_map + #include +#include #include OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") @@ -11,28 +14,30 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; SECTIONS { - . = 0xffffffff80100000; + . = __START_KERNEL; phys_startup_64 = startup_64 - LOAD_OFFSET; _text = .; /* Text and read-only data */ - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { *(.text) SCHED_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 - .text.lock : { *(.text.lock) } /* out-of-line lock text */ + /* out-of-line lock text */ + .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } _etext = .; /* End of text section */ . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : { *(__ex_table) } + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; RODATA - .data : { /* Data */ + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS } @@ -40,62 +45,95 @@ SECTIONS _edata = .; /* End of data section */ __bss_start = .; /* BSS */ - .bss : { + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { *(.bss.page_aligned) *(.bss) } __bss_end = .; + . = ALIGN(PAGE_SIZE); . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) -#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) -#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) -#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; - .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } - __vsyscall_0 = LOADADDR(.vsyscall_0); . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } - xtime_lock = LOADADDR(.xtime_lock); - .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } - vxtime = LOADADDR(.vxtime); - .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } - wall_jiffies = LOADADDR(.wall_jiffies); - .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } - sys_tz = LOADADDR(.sys_tz); - .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } - sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); - .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } - xtime = LOADADDR(.xtime); + .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } + xtime_lock = VVIRT(.xtime_lock); + + .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } + vxtime = VVIRT(.vxtime); + + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } + wall_jiffies = VVIRT(.wall_jiffies); + + .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } + sys_tz = VVIRT(.sys_tz); + + .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } + sysctl_vsyscall = VVIRT(.sysctl_vsyscall); + + .xtime : AT(VLOAD(.xtime)) { *(.xtime) } + xtime = VVIRT(.xtime); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } - jiffies = LOADADDR(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } - . = LOADADDR(.vsyscall_0) + 4096; + .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } + jiffies = VVIRT(.jiffies); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT . = ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } . = ALIGN(4096); - .data.page_aligned : { *(.data.page_aligned) } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + } . 
= ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : { + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; *(.init.text) _einittext = .; } __initdata_begin = .; - .init.data : { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } __initdata_end = .; . = ALIGN(16); __setup_start = .; - .init.setup : { *(.init.setup) } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : { + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -106,32 +144,38 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + *(.con_initcall.init) + } __con_initcall_end = .; SECURITY_INIT . = ALIGN(8); __alt_instructions = .; - .altinstructions : { *(.altinstructions) } + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + *(.altinstructions) + } __alt_instructions_end = .; - .altinstr_replacement : { *(.altinstr_replacement) } + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __init_end = .; . = ALIGN(4096); __nosave_begin = .; - .data_nosave : { *(.data.nosave) } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } . = ALIGN(4096); __nosave_end = .; -- cgit v1.2.3 From d0537508a9921efced238b20967e50e519ac34af Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:57:52 -0700 Subject: [PATCH] kexec: x86_64: add CONFIG_PHYSICAL_START For one kernel to report a crash another kernel has created we need to have 2 kernels loaded simultaneously in memory. To accomplish this the two kernels need to be built to run at different physical addresses. This patch adds the CONFIG_PHYSICAL_START option to the x86_64 kernel so we can do just that. You need to know what you are doing and what the ramifications are before changing this value, and most users won't care, so I have made it depend on CONFIG_EMBEDDED. bzImage kernels will work and run at a different address when compiled with this option, but they will still load at 1MB. If you need a kernel loaded at a different address as well, you need to boot a vmlinux.
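As a rough sketch only (these definitions live outside arch/x86_64/kernel, so they are not part of the hunks shown here, and the exact macro names and values are assumptions based on the message above), the option would plausibly be wired into the linker script and early page tables along these lines:

	/* Sketch, not from this patch: the asm-x86_64 side of CONFIG_PHYSICAL_START. */
	#define __PHYSICAL_START	CONFIG_PHYSICAL_START		/* physical load address, 1MB by default */
	#define __START_KERNEL_map	0xffffffff80000000		/* virtual base of the kernel text mapping */
	#define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)

	/* The head.S hunk below can then bias its early page-table entries, e.g.: */
	/*	.quad	0x0000000000002007 + __PHYSICAL_START */	/* -> level3_ident_pgt */

With something like this in place, a crash-capture kernel only needs a different CONFIG_PHYSICAL_START from the production kernel to be linked and mapped at a non-conflicting physical address.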
Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/head.S | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 9bd2e7a4b81..8d765aa77a2 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -248,23 +248,23 @@ ENTRY(_stext) */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ .org 0x2000 ENTRY(level3_ident_pgt) - .quad 0x0000000000104007 + .quad 0x0000000000004007 + __PHYSICAL_START .fill 511,8,0 .org 0x3000 ENTRY(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ .fill 1,8,0 .org 0x4000 @@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table) .org 0xa000 ENTRY(level3_physmem_pgt) - .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ .org 0xb000 #ifdef CONFIG_ACPI_SLEEP ENTRY(wakeup_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif .data -- cgit v1.2.3 From d89559589a588d1a654329d8cd9a3ad33aaad9be Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:58:02 -0700 Subject: [PATCH] kexec: x86_64: factor out apic shutdown code Factor out the apic and smp shutdown code from machine_restart so it can be called by in the kexec reboot path as well. Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/reboot.c | 62 ++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index be4b36f762c..57e71dbdfd6 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -66,41 +66,47 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_SMP -static void smp_halt(void) +static inline void kb_wait(void) { - int cpuid = safe_smp_processor_id(); - static int first_entry = 1; + int i; - if (reboot_force) - return; + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} - if (first_entry) { - first_entry = 0; - smp_call_function((void *)machine_restart, NULL, 1, 0); - } - - smp_stop_cpu(); +void machine_shutdown(void) +{ + /* Stop the cpus and apics */ +#ifdef CONFIG_SMP + int reboot_cpu_id; - /* AP calling this. 
Just halt */ - if (cpuid != boot_cpu_id) { - for (;;) - asm("hlt"); + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* Wait for all other CPUs to have run smp_stop_cpu */ - while (!cpus_empty(cpu_online_map)) - rep_nop(); -} + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. + */ + smp_send_stop(); #endif -static inline void kb_wait(void) -{ - int i; + local_irq_disable(); - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; +#ifndef CONFIG_SMP + disable_local_APIC(); +#endif + + disable_IO_APIC(); + + local_irq_enable(); } void machine_restart(char * __unused) @@ -109,9 +115,7 @@ void machine_restart(char * __unused) printk("machine restart\n"); -#ifdef CONFIG_SMP - smp_halt(); -#endif + machine_shutdown(); if (!reboot_force) { local_irq_disable(); -- cgit v1.2.3 From 5234f5eb04abbbfa306ccfbc2ccbb6e73f515b15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:58:02 -0700 Subject: [PATCH] kexec: x86_64 kexec implementation This is the x86_64 implementation of machine kexec. 32bit compatibility support has been implemented, and machine_kexec has been enhanced to not care about the changing internal kernel paget table structures. From: Alexander Nyberg build fix Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/Makefile | 1 + arch/x86_64/kernel/crash.c | 38 ++++++ arch/x86_64/kernel/machine_kexec.c | 245 +++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/relocate_kernel.S | 143 ++++++++++++++++++++ 4 files changed, 427 insertions(+) create mode 100644 arch/x86_64/kernel/crash.c create mode 100644 arch/x86_64/kernel/machine_kexec.c create mode 100644 arch/x86_64/kernel/relocate_kernel.S (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 5ca4a4598fd..48f9e2c19cd 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c new file mode 100644 index 00000000000..7caf8a49d0c --- /dev/null +++ b/arch/x86_64/kernel/crash.c @@ -0,0 +1,38 @@ +/* + * Architecture specific (x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_NOTE_BYTES 1024 +typedef u32 note_buf_t[MAX_NOTE_BYTES/4]; + +note_buf_t crash_notes[NR_CPUS]; + +void machine_crash_shutdown(void) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. 
+ * + * In practice this means shooting down the other cpus in + * an SMP system. + */ +} diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c new file mode 100644 index 00000000000..200b5993f8d --- /dev/null +++ b/arch/x86_64/kernel/machine_kexec.c @@ -0,0 +1,245 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LEVEL0_SIZE (1UL << 12UL) +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +#define LEVEL3_SIZE (1UL << 39UL) +#define LEVEL4_SIZE (1UL << 48UL) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) +#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +static void init_level2_page( + u64 *level2p, unsigned long addr) +{ + unsigned long end_addr; + addr &= PAGE_MASK; + end_addr = addr + LEVEL2_SIZE; + while(addr < end_addr) { + *(level2p++) = addr | L1_ATTR; + addr += LEVEL1_SIZE; + } +} + +static int init_level3_page(struct kimage *image, + u64 *level3p, unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL3_SIZE; + while((addr < last_addr) && (addr < end_addr)) { + struct page *page; + u64 *level2p; + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level2p = (u64 *)page_address(page); + init_level2_page(level2p, addr); + *(level3p++) = __pa(level2p) | L2_ATTR; + addr += LEVEL2_SIZE; + } + /* clear the unused entries */ + while(addr < end_addr) { + *(level3p++) = 0; + addr += LEVEL2_SIZE; + } +out: + return result; +} + + +static int init_level4_page(struct kimage *image, + u64 *level4p, unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL4_SIZE; + while((addr < last_addr) && (addr < end_addr)) { + struct page *page; + u64 *level3p; + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level3p = (u64 *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); + if (result) { + goto out; + } + *(level4p++) = __pa(level3p) | L3_ATTR; + addr += LEVEL3_SIZE; + } + /* clear the unused entries */ + while(addr < end_addr) { + *(level4p++) = 0; + addr += LEVEL3_SIZE; + } + out: + return result; +} + + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + u64 *level4p; + level4p = (u64 *)__va(start_pgtable); + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +} + +static void set_idt(void *newidt, u16 limit) +{ + unsigned char curidt[10]; + + /* x86-64 supports unaliged loads & stores */ + (*(u16 *)(curidt)) = limit; + (*(u64 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + unsigned char curgdt[10]; + + /* x86-64 supports unaligned loads & stores */ + (*(u16 *)(curgdt)) = limit; + (*(u64 *)(curgdt +2)) = (unsigned 
long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%ss\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + ); +#undef STR +#undef __STR +} + +typedef NORET_TYPE void (*relocate_new_kernel_t)( + unsigned long indirection_page, unsigned long control_code_buffer, + unsigned long start_address, unsigned long pgtable) ATTRIB_NORET; + +const extern unsigned char relocate_new_kernel[]; +const extern unsigned long relocate_new_kernel_size; + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable, control_code_buffer; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) { + return result; + } + + /* Place the code in the reboot code buffer */ + memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size); + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + return; +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list; + unsigned long control_code_buffer; + unsigned long start_pgtable; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Calculate the offsets */ + page_list = image->head; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Set the low half of the page table to my identity mapped + * page table for kexec. Leave the high half pointing at the + * kernel pages. Don't bother to flush the global pages + * as that will happen when I fully switch to my identity mapped + * page table anyway. + */ + memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); + __flush_tlb(); + + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again unless you set the segment to a different selector. + * + * The more common model are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + /* now call it */ + rnk = (relocate_new_kernel_t) control_code_buffer; + (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); +} diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S new file mode 100644 index 00000000000..d24fa9b72a2 --- /dev/null +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -0,0 +1,143 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel + .code64 +relocate_new_kernel: + /* %rdi page_list + * %rsi reboot_code_buffer + * %rdx start address + * %rcx page_table + * %r8 arg5 + * %r9 arg6 + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* set a new stack at the bottom of our page... */ + lea 4096(%rsi), %rsp + + /* store the parameters back on the stack */ + pushq %rdx /* store the start address */ + + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movq %cr0, %rax + andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax + orl $((1<<31)|(1<<0)), %eax + movq %rax, %cr0 + + /* Set cr4 to a known state: + * 10 0 == xmm exceptions disabled + * 9 0 == xmm registers instructions disabled + * 8 0 == performance monitoring counter disabled + * 7 0 == page global disabled + * 6 0 == machine check exceptions disabled + * 5 1 == physical address extension enabled + * 4 0 == page size extensions disabled + * 3 0 == Debug extensions disabled + * 2 0 == Time stamp disable (disabled) + * 1 0 == Protected mode virtual interrupts disabled + * 0 0 == VME disabled + */ + + movq $((1<<5)), %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Switch to the identity mapped page tables, + * and flush the TLB. + */ + movq %rcx, %cr3 + + /* Do the copies */ + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorq %rdi, %rdi + xorq %rsi, %rsi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testq $0x1, %rcx /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testq $0x2, %rcx /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testq $0x4, %rcx /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq $512, %rcx + rep ; movsq + jmp 0b +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* set all of the registers to known values */ + /* leave %rsp alone */ + + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .quad relocate_new_kernel_end - relocate_new_kernel -- cgit v1.2.3 From 5f5609df0c943b005847d3b2765d6dd47b624011 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 25 Jun 2005 14:58:04 -0700 Subject: [PATCH] crashdump: x86_64: crashkernel option This is the x86_64 implementation of the crashkernel option. It reserves a window of memory very early in the bootup process, so we never use it for anything but the kernel to switch to when the running kernel panics. 
In addition to reserving this memory a resource structure is registered so looking at /proc/iomem it is clear what happened to that memory. ISSUES: Is it possible to implement this in a architecture generic way? What should be done with architectures that always use an iommu and thus don't report their RAM memory resources in /proc/iomem? Signed-off-by: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/e820.c | 4 ++++ arch/x86_64/kernel/setup.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index cf4878309b0..6ded3a50dfe 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -210,6 +211,9 @@ void __init e820_reserve_resources(void) */ request_resource(res, &code_resource); request_resource(res, &data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index ca9a915a6d4..b02d921da4f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -367,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "noexec=", 7)) nonx_setup(from + 7); +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } +#endif + next_char: c = *(from++); if (!c) @@ -625,6 +647,13 @@ void __init setup_arch(char **cmdline_p) #endif sparse_init(); + +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) { + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); + } +#endif paging_init(); check_ioapic(); -- cgit v1.2.3 From 625f1c8219d95300ed32e4c67eb62a50ded095ba Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sat, 25 Jun 2005 14:58:12 -0700 Subject: [PATCH] Kdump: Export crash notes section address through sysfs o Following patch exports kexec global variable "crash_notes" to user space through sysfs as kernel attribute in /sys/kernel. Signed-off-by: Maneesh Soni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/crash.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 7caf8a49d0c..6183bcb8525 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -20,9 +20,6 @@ #include #include -#define MAX_NOTE_BYTES 1024 -typedef u32 note_buf_t[MAX_NOTE_BYTES/4]; - note_buf_t crash_notes[NR_CPUS]; void machine_crash_shutdown(void) -- cgit v1.2.3 From 6e274d144302068a00794ec22e73520c0615cb6f Mon Sep 17 00:00:00 2001 From: Alexander Nyberg Date: Sat, 25 Jun 2005 14:58:26 -0700 Subject: [PATCH] kdump: Use real pt_regs from exception Makes kexec_crashdump() take a pt_regs * as an argument. 
This allows us to get the exact register state at the point of the crash. If we come from a direct panic assertion, NULL will be passed and the current registers will be saved before the crash dump. This hooks into two places: die(): check the conditions under which we will panic when calling do_exit, and go there directly with the pt_regs that caused the fatal fault. die_nmi(): If we receive an NMI lockup while in the kernel, use the pt_regs and go directly to crash_kexec(). We're probably nested up badly at this point, so this might be the only chance to escape with proper information. Signed-off-by: Alexander Nyberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/crash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 6183bcb8525..d7fa4248501 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -22,7 +22,7 @@ note_buf_t crash_notes[NR_CPUS]; -void machine_crash_shutdown(void) +void machine_crash_shutdown(struct pt_regs *regs) { /* This function is only called after the system * has paniced or is otherwise in a critical state. -- cgit v1.2.3 From 72414d3f1d22fc3e311b162fca95c430048d38ce Mon Sep 17 00:00:00 2001 From: Maneesh Soni Date: Sat, 25 Jun 2005 14:58:28 -0700 Subject: [PATCH] kexec code cleanup o The following patch provides purely cosmetic changes and corrects CodingStyle guideline issues like those below in kexec-related files o braces for one line "if" statements, "for" loops, o more than 80 column wide lines, o No space after "while", "for" and "switch" keywords o Changes: o take-2: Removed the extra tab before "case" keywords. o take-3: Put operator at the end of line and space before "*/" Signed-off-by: Maneesh Soni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/machine_kexec.c | 49 +++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 200b5993f8d..60d1eff4156 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -32,29 +32,31 @@ #define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -static void init_level2_page( - u64 *level2p, unsigned long addr) +static void init_level2_page(u64 *level2p, unsigned long addr) { unsigned long end_addr; + addr &= PAGE_MASK; end_addr = addr + LEVEL2_SIZE; - while(addr < end_addr) { + while (addr < end_addr) { *(level2p++) = addr | L1_ATTR; addr += LEVEL1_SIZE; } } -static int init_level3_page(struct kimage *image, - u64 *level3p, unsigned long addr, unsigned long last_addr) +static int init_level3_page(struct kimage *image, u64 *level3p, + unsigned long addr, unsigned long last_addr) { unsigned long end_addr; int result; + result = 0; addr &= PAGE_MASK; end_addr = addr + LEVEL3_SIZE; - while((addr < last_addr) && (addr < end_addr)) { + while ((addr < last_addr) && (addr < end_addr)) { struct page *page; u64 *level2p; + page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; @@ -66,7 +68,7 @@ static int init_level3_page(struct kimage *image, addr += LEVEL2_SIZE; } /* clear the unused entries */ - while(addr < end_addr) { + while (addr < end_addr) { *(level3p++) = 0; addr += LEVEL2_SIZE; } @@ -75,17 +77,19 @@ out: } -static int init_level4_page(struct
kimage *image, - u64 *level4p, unsigned long addr, unsigned long last_addr) +static int init_level4_page(struct kimage *image, u64 *level4p, + unsigned long addr, unsigned long last_addr) { unsigned long end_addr; int result; + result = 0; addr &= PAGE_MASK; end_addr = addr + LEVEL4_SIZE; - while((addr < last_addr) && (addr < end_addr)) { + while ((addr < last_addr) && (addr < end_addr)) { struct page *page; u64 *level3p; + page = kimage_alloc_control_pages(image, 0); if (!page) { result = -ENOMEM; @@ -100,11 +104,11 @@ static int init_level4_page(struct kimage *image, addr += LEVEL3_SIZE; } /* clear the unused entries */ - while(addr < end_addr) { + while (addr < end_addr) { *(level4p++) = 0; addr += LEVEL3_SIZE; } - out: +out: return result; } @@ -113,7 +117,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { u64 *level4p; level4p = (u64 *)__va(start_pgtable); - return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); } static void set_idt(void *newidt, u16 limit) @@ -159,9 +163,10 @@ static void load_segments(void) #undef __STR } -typedef NORET_TYPE void (*relocate_new_kernel_t)( - unsigned long indirection_page, unsigned long control_code_buffer, - unsigned long start_address, unsigned long pgtable) ATTRIB_NORET; +typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, + unsigned long control_code_buffer, + unsigned long start_address, + unsigned long pgtable) ATTRIB_NORET; const extern unsigned char relocate_new_kernel[]; const extern unsigned long relocate_new_kernel_size; @@ -172,17 +177,17 @@ int machine_kexec_prepare(struct kimage *image) int result; /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; control_code_buffer = start_pgtable + 4096UL; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); - if (result) { + if (result) return result; - } /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size); + memcpy(__va(control_code_buffer), relocate_new_kernel, + relocate_new_kernel_size); return 0; } @@ -207,8 +212,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) local_irq_disable(); /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + page_list = image->head; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; control_code_buffer = start_pgtable + 4096UL; /* Set the low half of the page table to my identity mapped -- cgit v1.2.3 From 3e1d1d28d99dabe63c64f7f40f1ca1d646de1f73 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 24 Jun 2005 23:13:50 -0700 Subject: [PATCH] Cleanup patch for process freezing 1. Establish a simple API for process freezing defined in linux/include/sched.h: frozen(process) Check for frozen process freezing(process) Check if a process is being frozen freeze(process) Tell a process to freeze (go to refrigerator) thaw_process(process) Restart process frozen_process(process) Process is frozen now 2. Remove all references to PF_FREEZE and PF_FROZEN from all kernel sources except sched.h 3. Fix numerous locations where try_to_freeze is manually done by a driver 4. Remove the argument that is no longer necessary from two function calls. 5. Some whitespace cleanup 6. 
Clear potential race in refrigerator (provides an open window of PF_FREEZE cleared before setting PF_FROZEN, recalc_sigpending does not check PF_FROZEN). This patch does not address the problem of freeze_processes() violating the rule that a task may only modify its own flags by setting PF_FREEZE. This is not clean in an SMP environment. freeze(process) is therefore not SMP safe! Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- arch/x86_64/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 98b7ba95d58..98590a989f3 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) if (!user_mode(regs)) return 1; - if (try_to_freeze(0)) + if (try_to_freeze()) goto no_signal; if (!oldset) -- cgit v1.2.3
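As a hedged illustration of the call pattern this freezer cleanup leaves behind (the worker loop below is hypothetical and not taken from any patch above; only the argument-less try_to_freeze() comes from the signal.c hunk), a driver or kernel thread would now look roughly like this:

	#include <linux/sched.h>	/* try_to_freeze(), schedule_timeout() */
	#include <linux/kthread.h>	/* kthread_should_stop() */

	/* Hypothetical worker thread: the freezer hook no longer takes a flags argument. */
	static int example_thread(void *unused)
	{
		while (!kthread_should_stop()) {
			try_to_freeze();	/* park in the refrigerator if a freeze was requested */
			/* ... do one unit of real work ... */
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);
		}
		return 0;
	}

The same pattern replaces the old calls that passed an explicit argument, such as the try_to_freeze(0) removed from signal.c in the hunk above.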