aboutsummaryrefslogtreecommitdiff
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig91
-rw-r--r--arch/x86_64/Makefile2
-rw-r--r--arch/x86_64/boot/compressed/head.S11
-rw-r--r--arch/x86_64/boot/compressed/misc.c12
-rw-r--r--arch/x86_64/boot/install.sh6
-rw-r--r--arch/x86_64/boot/setup.S8
-rw-r--r--arch/x86_64/boot/tools/build.c5
-rw-r--r--arch/x86_64/ia32/ia32_signal.c15
-rw-r--r--arch/x86_64/ia32/ia32entry.S2
-rw-r--r--arch/x86_64/kernel/Makefile1
-rw-r--r--arch/x86_64/kernel/acpi/wakeup.S2
-rw-r--r--arch/x86_64/kernel/aperture.c4
-rw-r--r--arch/x86_64/kernel/apic.c46
-rw-r--r--arch/x86_64/kernel/crash.c35
-rw-r--r--arch/x86_64/kernel/e820.c6
-rw-r--r--arch/x86_64/kernel/early_printk.c13
-rw-r--r--arch/x86_64/kernel/genapic_flat.c110
-rw-r--r--arch/x86_64/kernel/head.S18
-rw-r--r--arch/x86_64/kernel/head64.c2
-rw-r--r--arch/x86_64/kernel/i387.c2
-rw-r--r--arch/x86_64/kernel/i8259.c27
-rw-r--r--arch/x86_64/kernel/io_apic.c36
-rw-r--r--arch/x86_64/kernel/irq.c29
-rw-r--r--arch/x86_64/kernel/kprobes.c190
-rw-r--r--arch/x86_64/kernel/machine_kexec.c250
-rw-r--r--arch/x86_64/kernel/mce.c10
-rw-r--r--arch/x86_64/kernel/mce_intel.c4
-rw-r--r--arch/x86_64/kernel/mpparse.c25
-rw-r--r--arch/x86_64/kernel/nmi.c4
-rw-r--r--arch/x86_64/kernel/process.c47
-rw-r--r--arch/x86_64/kernel/reboot.c62
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S143
-rw-r--r--arch/x86_64/kernel/setup.c80
-rw-r--r--arch/x86_64/kernel/setup64.c6
-rw-r--r--arch/x86_64/kernel/signal.c32
-rw-r--r--arch/x86_64/kernel/smp.c10
-rw-r--r--arch/x86_64/kernel/smpboot.c317
-rw-r--r--arch/x86_64/kernel/suspend.c18
-rw-r--r--arch/x86_64/kernel/time.c42
-rw-r--r--arch/x86_64/kernel/traps.c30
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S128
-rw-r--r--arch/x86_64/lib/delay.c7
-rw-r--r--arch/x86_64/mm/Makefile2
-rw-r--r--arch/x86_64/mm/fault.c4
-rw-r--r--arch/x86_64/mm/init.c9
-rw-r--r--arch/x86_64/mm/ioremap.c2
-rw-r--r--arch/x86_64/mm/numa.c10
-rw-r--r--arch/x86_64/pci/k8-bus.c16
48 files changed, 1552 insertions, 379 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 289f448ac89..d09437b5c48 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -207,33 +207,6 @@ config SMP
If you don't know what to do here, say N.
-config PREEMPT
- bool "Preemptible Kernel"
- ---help---
- This option reduces the latency of the kernel when reacting to
- real-time or interactive events by allowing a low priority process to
- be preempted even if it is in kernel mode executing a system call.
- This allows applications to run more reliably even when the system is
- under load. On contrary it may also break your drivers and add
- priority inheritance problems to your system. Don't select it if
- you rely on a stable system or have slightly obscure hardware.
- It's also not very well tested on x86-64 currently.
- You have been warned.
-
- Say Y here if you are feeling brave and building a kernel for a
- desktop, embedded or real-time system. Say N if you are unsure.
-
-config PREEMPT_BKL
- bool "Preempt The Big Kernel Lock"
- depends on PREEMPT
- default y
- help
- This option reduces the latency of the kernel by making the
- big kernel lock preemptible.
-
- Say Y here if you are building a kernel for a desktop system.
- Say N if you are unsure.
-
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
@@ -244,6 +217,8 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
+source "kernel/Kconfig.preempt"
+
config K8_NUMA
bool "K8 NUMA support"
select NUMA
@@ -265,7 +240,7 @@ config NUMA_EMU
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
-config DISCONTIGMEM
+config ARCH_DISCONTIGMEM_ENABLE
bool
depends on NUMA
default y
@@ -274,6 +249,27 @@ config NUMA
bool
default n
+config ARCH_DISCONTIGMEM_ENABLE
+ def_bool y
+ depends on NUMA
+
+config ARCH_DISCONTIGMEM_DEFAULT
+ def_bool y
+ depends on NUMA
+
+config ARCH_SPARSEMEM_ENABLE
+ def_bool y
+ depends on NUMA
+
+config ARCH_FLATMEM_ENABLE
+ def_bool y
+ depends on !NUMA
+
+source "mm/Kconfig"
+
+config HAVE_ARCH_EARLY_PFN_TO_NID
+ def_bool y
+
config HAVE_DEC_LOCK
bool
depends on SMP
@@ -292,6 +288,15 @@ config NR_CPUS
This is purely to save memory - each supported CPU requires
memory in the static kernel configuration.
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL
+ help
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu/cpu#.
+ Say N if you want to disable CPU hotplug.
+
+
config HPET_TIMER
bool
default y
@@ -364,6 +369,34 @@ config X86_MCE_INTEL
Additional support for intel specific MCE features such as
the thermal monitor.
+config PHYSICAL_START
+ hex "Physical address where the kernel is loaded" if EMBEDDED
+ default "0x100000"
+ help
+ This gives the physical address where the kernel is loaded.
+ Primarily used in the case of kexec on panic where the
+ fail safe kernel needs to run at a different address than
+ the panic-ed kernel.
+
+ Don't change this unless you know what you are doing.
+
+config KEXEC
+ bool "kexec system call (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is indepedent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similiarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. It may help to enable device hotplugging
+ support. As of this writing the exact hardware interface is
+ strongly in flux, so no good recommendation can be made.
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
depends on PROC_FS
@@ -381,6 +414,8 @@ config SECCOMP
If unsure, say Y. Only embedded should say N here.
+source kernel/Kconfig.hz
+
endmenu
#
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 6f90c246c41..8a73794f9b9 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
LDFLAGS := -m elf_x86_64
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
-LDFLAGS_vmlinux := -e stext
+LDFLAGS_vmlinux :=
CHECKFLAGS += -D__x86_64__ -m64
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 27264dbd575..6f55565e4d4 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -2,8 +2,6 @@
* linux/boot/head.S
*
* Copyright (C) 1991, 1992, 1993 Linus Torvalds
- *
- * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $
*/
/*
@@ -21,13 +19,14 @@
*/
/*
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
.code32
.text
#include <linux/linkage.h>
#include <asm/segment.h>
+#include <asm/page.h>
.code32
.globl startup_32
@@ -77,7 +76,7 @@ startup_32:
jnz 3f
addl $8,%esp
xorl %ebx,%ebx
- ljmp $(__KERNEL_CS), $0x100000
+ ljmp $(__KERNEL_CS), $__PHYSICAL_START
/*
* We come here, if we were loaded high.
@@ -103,7 +102,7 @@ startup_32:
popl %ecx # lcount
popl %edx # high_buffer_start
popl %eax # hcount
- movl $0x100000,%edi
+ movl $__PHYSICAL_START,%edi
cli # make sure we don't get interrupted
ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
@@ -128,7 +127,7 @@ move_routine_start:
movsl
movl %ebx,%esi # Restore setup pointer
xorl %ebx,%ebx
- ljmp $(__KERNEL_CS), $0x100000
+ ljmp $(__KERNEL_CS), $__PHYSICAL_START
move_routine_end:
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index c8b9216f9e6..b38d5b8b5fb 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -11,6 +11,7 @@
#include "miscsetup.h"
#include <asm/io.h>
+#include <asm/page.h>
/*
* gzip declarations
@@ -92,8 +93,11 @@ static unsigned long output_ptr = 0;
static void *malloc(int size);
static void free(void *where);
+void* memset(void* s, int c, unsigned n);
+void* memcpy(void* dest, const void* src, unsigned n);
+
static void putstr(const char *);
-
+
extern int end;
static long free_mem_ptr = (long)&end;
static long free_mem_end_ptr;
@@ -284,7 +288,7 @@ void setup_normal_output_buffer(void)
#else
if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
#endif
- output_data = (char *)0x100000; /* Points to 1M */
+ output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */
free_mem_end_ptr = (long)real_mode;
}
@@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv)
low_buffer_size = low_buffer_end - LOW_BUFFER_START;
high_loaded = 1;
free_mem_end_ptr = (long)high_buffer_start;
- if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
- high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+ if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
+ high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
mv->hcount = 0; /* say: we need not to move high_buffer */
}
else mv->hcount = -1;
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
index 90f2452b3b9..198af15a775 100644
--- a/arch/x86_64/boot/install.sh
+++ b/arch/x86_64/boot/install.sh
@@ -1,6 +1,6 @@
#!/bin/sh
#
-# arch/i386/boot/install.sh
+# arch/x86_64/boot/install.sh
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
@@ -21,8 +21,8 @@
# User may have a custom install script
-if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
-if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
+if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi
+if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi
# Default install - same as make zlilo
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index 75d4d2ad93b..ff58b2832b7 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -33,7 +33,7 @@
* Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
* <stiker@northlink.com>
*
- * Fix to work around buggy BIOSes which dont use carry bit correctly
+ * Fix to work around buggy BIOSes which don't use carry bit correctly
* and/or report extended memory in CX/DX for e801h memory size detection
* call. As a result the kernel got wrong figures. The int15/e801h docs
* from Ralf Brown interrupt list seem to indicate AX/BX should be used
@@ -383,7 +383,7 @@ sse_ok:
# a whole bunch of different types, and allows memory holes and
# everything. We scan through this memory map and build a list
# of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
+# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
#define SMAP 0x534d4150
@@ -436,7 +436,7 @@ bail820:
meme801:
stc # fix to work around buggy
- xorw %cx,%cx # BIOSes which dont clear/set
+ xorw %cx,%cx # BIOSes which don't clear/set
xorw %dx,%dx # carry on pass/error of
# e801h memory size call
# or merely pass cx,dx though
@@ -733,7 +733,7 @@ flush_instr:
#
# but we yet haven't reloaded the CS register, so the default size
# of the target offset still is 16 bit.
-# However, using an operant prefix (0x66), the CPU will properly
+# However, using an operand prefix (0x66), the CPU will properly
# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
# Manual, Mixing 16-bit and 32-bit code, page 16-6)
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
index c2fa6631317..18b5bac1c42 100644
--- a/arch/x86_64/boot/tools/build.c
+++ b/arch/x86_64/boot/tools/build.c
@@ -1,6 +1,4 @@
/*
- * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
- *
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1997 Martin Mares
*/
@@ -8,7 +6,8 @@
/*
* This file builds a disk-image from three different files:
*
- * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
+ * - bootsect: compatibility mbr which prints an error message if
+ * someone tries to boot the kernel directly.
* - setup: 8086 machine code, sets up system parm
* - system: 80386 code for actual system
*
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index fbd09b5126c..66e2821533d 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -428,8 +428,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
return (void __user *)((rsp - frame_size) & -8UL);
}
-void ia32_setup_frame(int sig, struct k_sigaction *ka,
- compat_sigset_t *set, struct pt_regs * regs)
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ compat_sigset_t *set, struct pt_regs * regs)
{
struct sigframe __user *frame;
int err = 0;
@@ -514,14 +514,15 @@ void ia32_setup_frame(int sig, struct k_sigaction *ka,
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return;
+ return 1;
give_sigsegv:
force_sigsegv(sig, current);
+ return 0;
}
-void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- compat_sigset_t *set, struct pt_regs * regs)
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ compat_sigset_t *set, struct pt_regs * regs)
{
struct rt_sigframe __user *frame;
int err = 0;
@@ -613,9 +614,9 @@ void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return;
+ return 1;
give_sigsegv:
force_sigsegv(sig, current);
+ return 0;
}
-
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index f3ca0db85b5..cc935427d53 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -589,7 +589,7 @@ ia32_sys_call_table:
.quad compat_sys_mq_timedreceive /* 280 */
.quad compat_sys_mq_notify
.quad compat_sys_mq_getsetattr
- .quad quiet_ni_syscall /* reserved for kexec */
+ .quad compat_sys_kexec_load /* reserved for kexec */
.quad compat_sys_waitid
.quad quiet_ni_syscall /* sys_altroot */
.quad sys_add_key
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fd..48f9e2c19cd 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index a4c630034cd..185faa911db 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -67,7 +67,7 @@ wakeup_code:
shll $4, %eax
addl $(gdta - wakeup_code), %eax
movl %eax, gdt_48a +2 - wakeup_code
- lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
+ lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
# appropriate
movl $1, %eax # protected mode (PE) bit
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 504e6347499..c9a6b812e92 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -40,11 +40,7 @@ int fix_aperture __initdata = 1;
static u32 __init allocate_aperture(void)
{
-#ifdef CONFIG_DISCONTIGMEM
pg_data_t *nd0 = NODE_DATA(0);
-#else
- pg_data_t *nd0 = &contig_page_data;
-#endif
u32 aper_size;
void *p;
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f8e6cc4fecd..375d369570c 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void)
}
}
-void disconnect_bsp_APIC(void)
+void disconnect_bsp_APIC(int virt_wire_setup)
{
if (pic_mode) {
/*
@@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void)
outb(0x70, 0x22);
outb(0x00, 0x23);
}
+ else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write_around(APIC_LVT0, value);
+ }
+ else {
+ /* Disable LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
}
void disable_local_APIC(void)
@@ -285,7 +321,7 @@ void __init init_bsp_APIC(void)
apic_write_around(APIC_LVT1, value);
}
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
{
unsigned int value, ver, maxlvt;
@@ -534,7 +570,7 @@ static struct sys_device device_lapic = {
.cls = &lapic_sysclass,
};
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void)
local_irq_enable();
}
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
local_irq_disable(); /* FIXME: Do we need this? --RR */
setup_APIC_timer(calibration_result);
local_irq_enable();
}
-void __init disable_APIC_timer(void)
+void __cpuinit disable_APIC_timer(void)
{
if (using_apic_timer) {
unsigned long v;
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 00000000000..d7fa4248501
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,35 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has paniced or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 7c154dfff64..6ded3a50dfe 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
+#include <linux/kexec.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -191,8 +192,6 @@ void __init e820_reserve_resources(void)
int i;
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
@@ -212,6 +211,9 @@ void __init e820_reserve_resources(void)
*/
request_resource(res, &code_resource);
request_resource(res, &data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
}
}
}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index e3a19e8ebbf..9631c747c5e 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -2,20 +2,24 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
+#include <linux/tty.h>
#include <asm/io.h>
#include <asm/processor.h>
/* Simple VGA output */
#ifdef __i386__
+#include <asm/setup.h>
#define VGABASE (__ISA_IO_base + 0xb8000)
#else
+#include <asm/bootsetup.h>
#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
#endif
-#define MAX_YPOS 25
-#define MAX_XPOS 80
+#define MAX_YPOS max_ypos
+#define MAX_XPOS max_xpos
+static int max_ypos = 25, max_xpos = 80;
static int current_ypos = 1, current_xpos = 0;
static void early_vga_write(struct console *con, const char *str, unsigned n)
@@ -196,7 +200,10 @@ int __init setup_early_printk(char *opt)
} else if (!strncmp(buf, "ttyS", 4)) {
early_serial_init(buf);
early_console = &early_serial_console;
- } else if (!strncmp(buf, "vga", 3)) {
+ } else if (!strncmp(buf, "vga", 3)
+ && SCREEN_INFO.orig_video_isVGA == 1) {
+ max_xpos = SCREEN_INFO.orig_video_cols;
+ max_ypos = SCREEN_INFO.orig_video_lines;
early_console = &early_vga_console;
}
early_console_initialized = 1;
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index b4cbbad0422..28284696508 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -7,6 +7,8 @@
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
+ * Ashok Raj <ashok.raj@intel.com>
+ * Removed IPI broadcast shortcut to support CPU hotplug
*/
#include <linux/config.h>
#include <linux/threads.h>
@@ -18,6 +20,46 @@
#include <asm/smp.h>
#include <asm/ipi.h>
+/*
+ * The following permit choosing broadcast IPI shortcut v.s sending IPI only
+ * to online cpus via the send_IPI_mask varient.
+ * The mask version is my preferred option, since it eliminates a lot of
+ * other extra code that would need to be written to cleanup intrs sent
+ * to a CPU while offline.
+ *
+ * Sending broadcast introduces lots of trouble in CPU hotplug situations.
+ * These IPI's are delivered to cpu's irrespective of their offline status
+ * and could pickup stale intr data when these CPUS are turned online.
+ *
+ * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with
+ * the idea of not using broadcast IPI's anymore. Hence the run time check
+ * is introduced, on his request so we can choose an alternate mechanism.
+ *
+ * Initial wacky performance tests that collect cycle counts show
+ * no increase in using mask v.s broadcast version. In fact they seem
+ * identical in terms of cycle counts.
+ *
+ * if we need to use broadcast, we need to do the following.
+ *
+ * cli;
+ * hold call_lock;
+ * clear any pending IPI, just ack and clear all pending intr
+ * set cpu_online_map;
+ * release call_lock;
+ * sti;
+ *
+ * The complicated dummy irq processing shown above is not required if
+ * we didnt sent IPI's to wrong CPU's in the first place.
+ *
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+static int no_broadcast=DEFAULT_SEND_IPI;
static cpumask_t flat_target_cpus(void)
{
@@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void)
apic_write_around(APIC_LDR, val);
}
-static void flat_send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then
- * we get an APIC send error if we try to broadcast.
- * thus we have to avoid sending IPIs in this case.
- */
- if (num_online_cpus() > 1)
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static void flat_send_IPI_all(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
{
unsigned long mask = cpus_addr(cpumask)[0];
@@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
local_irq_restore(flags);
}
+static inline void __local_flat_send_IPI_allbutself(int vector)
+{
+ if (no_broadcast) {
+ cpumask_t mask = cpu_online_map;
+ int this_cpu = get_cpu();
+
+ cpu_clear(this_cpu, mask);
+ flat_send_IPI_mask(mask, vector);
+ put_cpu();
+ }
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
+}
+
+static inline void __local_flat_send_IPI_all(int vector)
+{
+ if (no_broadcast)
+ flat_send_IPI_mask(cpu_online_map, vector);
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+ if (((num_online_cpus()) - 1) >= 1)
+ __local_flat_send_IPI_allbutself(vector);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ __local_flat_send_IPI_all(vector);
+}
+
static int flat_apic_id_registered(void)
{
return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
@@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb)
return ((ebx >> 24) & 0xFF) >> index_msb;
}
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
+ "IPI Broadcast");
+ return 1;
+}
+
+__setup("no_ipi_broadcast", no_ipi_broadcast);
+
struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
@@ -125,3 +194,12 @@ struct genapic apic_flat = {
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
};
+
+static int __init print_ipi_mode(void)
+{
+ printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
+ "Shortcut");
+ return 0;
+}
+
+late_initcall(print_ipi_mode);
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 9bd2e7a4b81..8d765aa77a2 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -248,23 +248,23 @@ ENTRY(_stext)
*/
.org 0x1000
ENTRY(init_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
.org 0x2000
ENTRY(level3_ident_pgt)
- .quad 0x0000000000104007
+ .quad 0x0000000000004007 + __PHYSICAL_START
.fill 511,8,0
.org 0x3000
ENTRY(level3_kernel_pgt)
.fill 510,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad 0x0000000000105007 /* -> level2_kernel_pgt */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */
.fill 1,8,0
.org 0x4000
@@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table)
.org 0xa000
ENTRY(level3_physmem_pgt)
- .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
.org 0xb000
#ifdef CONFIG_ACPI_SLEEP
ENTRY(wakeup_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
#endif
.data
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 0f8c78dcd38..cf6ab147a2a 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
s = strstr(saved_command_line, "earlyprintk=");
if (s != NULL)
setup_early_printk(s);
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
s = strstr(saved_command_line, "numa=");
if (s != NULL)
numa_setup(s+5);
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index ba139cac57c..d9b22b633e3 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void)
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
-void __init fpu_init(void)
+void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
extern void __bad_fxsave_alignment(void);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 7873d9ba881..a8916909512 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -157,14 +157,13 @@ static unsigned int startup_8259A_irq(unsigned int irq)
}
static struct hw_interrupt_type i8259A_irq_type = {
- "XT-PIC",
- startup_8259A_irq,
- shutdown_8259A_irq,
- enable_8259A_irq,
- disable_8259A_irq,
- mask_and_ack_8259A,
- end_8259A_irq,
- NULL
+ .typename = "XT-PIC",
+ .startup = startup_8259A_irq,
+ .shutdown = shutdown_8259A_irq,
+ .enable = enable_8259A_irq,
+ .disable = disable_8259A_irq,
+ .ack = mask_and_ack_8259A,
+ .end = end_8259A_irq,
};
/*
@@ -415,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
return 0;
}
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-1 */
+ return 0;
+}
+
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index afd87e64d0a..157190d986b 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type)
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
-static int __init find_isa_irq_pin(int irq, int type)
+static int find_isa_irq_pin(int irq, int type)
{
int i;
@@ -1132,12 +1132,44 @@ static void __init enable_IO_APIC(void)
*/
void disable_IO_APIC(void)
{
+ int pin;
/*
* Clear the IO-APIC before rebooting:
*/
clear_IO_APIC();
- disconnect_bsp_APIC();
+ /*
+ * If the i82559 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrups can be delivered.
+ */
+ pin = find_isa_irq_pin(0, mp_ExtINT);
+ if (pin != -1) {
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = 7; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest = 0;
+
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ disconnect_bsp_APIC(pin != -1);
}
/*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 62b112e4deb..cc3fb85f514 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
return 1;
}
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+}
+#endif
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index f77f8a0ff18..4e680f87a75 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -27,6 +27,8 @@
* <prasanna@in.ibm.com> adapted for x86_64
* 2005-Mar Roland McGrath <roland@redhat.com>
* Fixed to handle %rip-relative addressing mode correctly.
+ * 2005-May Rusty Lynch <rusty.lynch@intel.com>
+ * Added function return probes functionality
*/
#include <linux/config.h>
@@ -37,18 +39,16 @@
#include <linux/slab.h>
#include <linux/preempt.h>
#include <linux/moduleloader.h>
-
+#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/kdebug.h>
static DECLARE_MUTEX(kprobe_mutex);
-/* kprobe_status settings */
-#define KPROBE_HIT_ACTIVE 0x00000001
-#define KPROBE_HIT_SS 0x00000002
-
static struct kprobe *current_kprobe;
static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
+static struct kprobe *kprobe_prev;
+static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
static struct pt_regs jprobe_saved_regs;
static long *jprobe_saved_rsp;
static kprobe_opcode_t *get_insn_slot(void);
@@ -214,6 +214,21 @@ void arch_copy_kprobe(struct kprobe *p)
BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
*ripdisp = disp;
}
+ p->opcode = *p->addr;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+ *p->addr = BREAKPOINT_INSTRUCTION;
+ flush_icache_range((unsigned long) p->addr,
+ (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ *p->addr = p->opcode;
+ flush_icache_range((unsigned long) p->addr,
+ (unsigned long) p->addr + sizeof(kprobe_opcode_t));
}
void arch_remove_kprobe(struct kprobe *p)
@@ -223,10 +238,29 @@ void arch_remove_kprobe(struct kprobe *p)
down(&kprobe_mutex);
}
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+static inline void save_previous_kprobe(void)
{
- *p->addr = p->opcode;
- regs->rip = (unsigned long)p->addr;
+ kprobe_prev = current_kprobe;
+ kprobe_status_prev = kprobe_status;
+ kprobe_old_rflags_prev = kprobe_old_rflags;
+ kprobe_saved_rflags_prev = kprobe_saved_rflags;
+}
+
+static inline void restore_previous_kprobe(void)
+{
+ current_kprobe = kprobe_prev;
+ kprobe_status = kprobe_status_prev;
+ kprobe_old_rflags = kprobe_old_rflags_prev;
+ kprobe_saved_rflags = kprobe_saved_rflags_prev;
+}
+
+static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ current_kprobe = p;
+ kprobe_saved_rflags = kprobe_old_rflags
+ = (regs->eflags & (TF_MASK | IF_MASK));
+ if (is_IF_modifier(p->ainsn.insn))
+ kprobe_saved_rflags &= ~IF_MASK;
}
static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -240,6 +274,50 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
regs->rip = (unsigned long)p->ainsn.insn;
}
+struct task_struct *arch_get_kprobe_task(void *ptr)
+{
+ return ((struct thread_info *) (((unsigned long) ptr) &
+ (~(THREAD_SIZE -1))))->task;
+}
+
+void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+{
+ unsigned long *sara = (unsigned long *)regs->rsp;
+ struct kretprobe_instance *ri;
+ static void *orig_ret_addr;
+
+ /*
+ * Save the return address when the return probe hits
+ * the first time, and use it to populate the (krprobe
+ * instance)->ret_addr for subsequent return probes at
+ * the same addrress since stack address would have
+ * the kretprobe_trampoline by then.
+ */
+ if (((void*) *sara) != kretprobe_trampoline)
+ orig_ret_addr = (void*) *sara;
+
+ if ((ri = get_free_rp_inst(rp)) != NULL) {
+ ri->rp = rp;
+ ri->stack_addr = sara;
+ ri->ret_addr = orig_ret_addr;
+ add_rp_inst(ri);
+ /* Replace the return addr with trampoline addr */
+ *sara = (unsigned long) &kretprobe_trampoline;
+ } else {
+ rp->nmissed++;
+ }
+}
+
+void arch_kprobe_flush_task(struct task_struct *tk)
+{
+ struct kretprobe_instance *ri;
+ while ((ri = get_rp_inst_tsk(tk)) != NULL) {
+ *((unsigned long *)(ri->stack_addr)) =
+ (unsigned long) ri->ret_addr;
+ recycle_rp_inst(ri);
+ }
+}
+
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled thorough out this function.
@@ -264,9 +342,30 @@ int kprobe_handler(struct pt_regs *regs)
regs->eflags |= kprobe_saved_rflags;
unlock_kprobes();
goto no_kprobe;
+ } else if (kprobe_status == KPROBE_HIT_SSDONE) {
+ /* TODO: Provide re-entrancy from
+ * post_kprobes_handler() and avoid exception
+ * stack corruption while single-stepping on
+ * the instruction of the new probe.
+ */
+ arch_disarm_kprobe(p);
+ regs->rip = (unsigned long)p->addr;
+ ret = 1;
+ } else {
+ /* We have reentered the kprobe_handler(), since
+ * another probe was hit while within the
+ * handler. We here save the original kprobe
+ * variables and just single step on instruction
+ * of the new probe without calling any user
+ * handlers.
+ */
+ save_previous_kprobe();
+ set_current_kprobe(p, regs);
+ p->nmissed++;
+ prepare_singlestep(p, regs);
+ kprobe_status = KPROBE_REENTER;
+ return 1;
}
- disarm_kprobe(p, regs);
- ret = 1;
} else {
p = current_kprobe;
if (p->break_handler && p->break_handler(p, regs)) {
@@ -296,11 +395,7 @@ int kprobe_handler(struct pt_regs *regs)
}
kprobe_status = KPROBE_HIT_ACTIVE;
- current_kprobe = p;
- kprobe_saved_rflags = kprobe_old_rflags
- = (regs->eflags & (TF_MASK | IF_MASK));
- if (is_IF_modifier(p->ainsn.insn))
- kprobe_saved_rflags &= ~IF_MASK;
+ set_current_kprobe(p, regs);
if (p->pre_handler && p->pre_handler(p, regs))
/* handler has already set things up, so skip ss setup */
@@ -317,6 +412,55 @@ no_kprobe:
}
/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */
+ void kretprobe_trampoline_holder(void)
+ {
+ asm volatile ( ".global kretprobe_trampoline\n"
+ "kretprobe_trampoline: \n"
+ "nop\n");
+ }
+
+/*
+ * Called when we hit the probe point at kretprobe_trampoline
+ */
+int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct task_struct *tsk;
+ struct kretprobe_instance *ri;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ unsigned long *sara = (unsigned long *)regs->rsp - 1;
+
+ tsk = arch_get_kprobe_task(sara);
+ head = kretprobe_inst_table_head(tsk);
+
+ hlist_for_each_entry(ri, node, head, hlist) {
+ if (ri->stack_addr == sara && ri->rp) {
+ if (ri->rp->handler)
+ ri->rp->handler(ri, regs);
+ }
+ }
+ return 0;
+}
+
+void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
+{
+ struct kretprobe_instance *ri;
+ /* RA already popped */
+ unsigned long *sara = ((unsigned long *)regs->rsp) - 1;
+
+ while ((ri = get_rp_inst(sara))) {
+ regs->rip = (unsigned long)ri->ret_addr;
+ recycle_rp_inst(ri);
+ }
+ regs->eflags &= ~TF_MASK;
+}
+
+/*
* Called after single-stepping. p->addr is the address of the
* instruction whose first byte has been replaced by the "int 3"
* instruction. To avoid the SMP problems that can occur when we
@@ -401,13 +545,23 @@ int post_kprobe_handler(struct pt_regs *regs)
if (!kprobe_running())
return 0;
- if (current_kprobe->post_handler)
+ if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) {
+ kprobe_status = KPROBE_HIT_SSDONE;
current_kprobe->post_handler(current_kprobe, regs, 0);
+ }
- resume_execution(current_kprobe, regs);
+ if (current_kprobe->post_handler != trampoline_post_handler)
+ resume_execution(current_kprobe, regs);
regs->eflags |= kprobe_saved_rflags;
- unlock_kprobes();
+ /* Restore the original saved kprobes variables and continue. */
+ if (kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe();
+ goto out;
+ } else {
+ unlock_kprobes();
+ }
+out:
preempt_enable_no_resched();
/*
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 00000000000..60d1eff4156
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,250 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(u64 *level2p, unsigned long addr)
+{
+ unsigned long end_addr;
+
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL2_SIZE;
+ while (addr < end_addr) {
+ *(level2p++) = addr | L1_ATTR;
+ addr += LEVEL1_SIZE;
+ }
+}
+
+static int init_level3_page(struct kimage *image, u64 *level3p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL3_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ u64 *level2p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level2p = (u64 *)page_address(page);
+ init_level2_page(level2p, addr);
+ *(level3p++) = __pa(level2p) | L2_ATTR;
+ addr += LEVEL2_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ *(level3p++) = 0;
+ addr += LEVEL2_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_level4_page(struct kimage *image, u64 *level4p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL4_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ u64 *level3p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level3p = (u64 *)page_address(page);
+ result = init_level3_page(image, level3p, addr, last_addr);
+ if (result) {
+ goto out;
+ }
+ *(level4p++) = __pa(level3p) | L3_ATTR;
+ addr += LEVEL3_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ *(level4p++) = 0;
+ addr += LEVEL3_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ u64 *level4p;
+ level4p = (u64 *)__va(start_pgtable);
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+ unsigned char curidt[10];
+
+ /* x86-64 supports unaliged loads & stores */
+ (*(u16 *)(curidt)) = limit;
+ (*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+
+ __asm__ __volatile__ (
+ "lidt %0\n"
+ : "=m" (curidt)
+ );
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+ unsigned char curgdt[10];
+
+ /* x86-64 supports unaligned loads & stores */
+ (*(u16 *)(curgdt)) = limit;
+ (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+ __asm__ __volatile__ (
+ "lgdt %0\n"
+ : "=m" (curgdt)
+ );
+};
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ (
+ "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+ "\tmovl %eax,%ds\n"
+ "\tmovl %eax,%es\n"
+ "\tmovl %eax,%ss\n"
+ "\tmovl %eax,%fs\n"
+ "\tmovl %eax,%gs\n"
+ );
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+ unsigned long control_code_buffer,
+ unsigned long start_address,
+ unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+ unsigned long start_pgtable, control_code_buffer;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result)
+ return result;
+
+ /* Place the code in the reboot code buffer */
+ memcpy(__va(control_code_buffer), relocate_new_kernel,
+ relocate_new_kernel_size);
+
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list;
+ unsigned long control_code_buffer;
+ unsigned long start_pgtable;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* Calculate the offsets */
+ page_list = image->head;
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Set the low half of the page table to my identity mapped
+ * page table for kexec. Leave the high half pointing at the
+ * kernel pages. Don't bother to flush the global pages
+ * as that will happen when I fully switch to my identity mapped
+ * page table anyway.
+ */
+ memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+ __flush_tlb();
+
+
+ /* The segment registers are funny things, they are
+ * automatically loaded from a table, in memory wherever you
+ * set them to a specific selector, but this table is never
+ * accessed again unless you set the segment to a different selector.
+ *
+ * The more common model are caches where the behide
+ * the scenes work is done, but is also dropped at arbitrary
+ * times.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+ /* now call it */
+ rnk = (relocate_new_kernel_t) control_code_buffer;
+ (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 3a89d735a4f..21e70625a49 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -327,7 +327,7 @@ static void mce_init(void *dummy)
}
/* Add per CPU specific workarounds here */
-static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
@@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
}
}
-static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
@@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c)
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
-void __init mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
@@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
memset(mcelog.entry, 0, next * sizeof(struct mce));
mcelog.next = 0;
- synchronize_kernel();
+ synchronize_sched();
/* Collect entries that were still getting written before the synchronize. */
@@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
-static __init int mce_init_device(void)
+static __cpuinit int mce_init_device(void)
{
int err;
if (!mce_available(&boot_cpu_data))
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 4db9a640069..0be0a795981 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -42,7 +42,7 @@ done:
irq_exit();
}
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
{
u32 l, h;
int tm2 = 0;
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
-void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 61a63be6b29..9c5aa2a790c 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -23,6 +23,7 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi.h>
+#include <linux/module.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
@@ -45,7 +46,8 @@ int acpi_found_madt;
int apic_version [MAX_APICS];
unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL };
+unsigned char pci_bus_to_node [256];
+EXPORT_SYMBOL(pci_bus_to_node);
static int mp_current_pci_id = 0;
/* I/O APIC entries */
@@ -904,11 +906,20 @@ void __init mp_config_acpi_legacy_irqs (void)
return;
}
+#define MAX_GSI_NUM 4096
+
int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
{
int ioapic = -1;
int ioapic_pin = 0;
int idx, bit = 0;
+ static int pci_irq = 16;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, to the IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
@@ -943,11 +954,21 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
- return gsi;
+ return gsi_to_irq[gsi];
}
mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
+ if (edge_level) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ gsi = pci_irq++;
+ gsi_to_irq[irq] = gsi;
+ }
+
io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 31c0f2e6ac9..4e44d6e6b7e 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val;
(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
-static __init inline int nmi_known_cpu(void)
+static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void)
}
/* Run after command line and cpu_init init, but before all other checks */
-void __init nmi_watchdog_default(void)
+void __cpuinit nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index dce8bab4306..1d91271796e 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -8,7 +8,8 @@
*
* X86-64 port
* Andi Kleen.
- *
+ *
+ * CPU hotplug support - ashok.raj@intel.com
* $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
*/
@@ -18,6 +19,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -34,6 +36,7 @@
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
+#include <linux/kprobes.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -153,6 +156,29 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ while (1)
+ safe_halt();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
@@ -173,6 +199,8 @@ void cpu_idle (void)
idle = pm_idle;
if (!idle)
idle = default_idle;
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
idle();
}
@@ -203,7 +231,7 @@ static void mwait_idle(void)
}
}
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
@@ -293,6 +321,14 @@ void exit_thread(void)
{
struct task_struct *me = current;
struct thread_struct *t = &me->thread;
+
+ /*
+ * Remove function-return probe instances associated with this task
+ * and put them back on the free list. Do not insert an exit probe for
+ * this function, it will be disabled by kprobe_flush_task if you do.
+ */
+ kprobe_flush_task(me);
+
if (me->thread.io_bitmap_ptr) {
struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
@@ -312,6 +348,13 @@ void flush_thread(void)
struct task_struct *tsk = current;
struct thread_info *t = current_thread_info();
+ /*
+ * Remove function-return probe instances associated with this task
+ * and put them back on the free list. Do not insert an exit probe for
+ * this function, it will be disabled by kprobe_flush_task if you do.
+ */
+ kprobe_flush_task(tsk);
+
if (t->flags & _TIF_ABI_PENDING)
t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index be4b36f762c..57e71dbdfd6 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -66,41 +66,47 @@ static int __init reboot_setup(char *str)
__setup("reboot=", reboot_setup);
-#ifdef CONFIG_SMP
-static void smp_halt(void)
+static inline void kb_wait(void)
{
- int cpuid = safe_smp_processor_id();
- static int first_entry = 1;
+ int i;
- if (reboot_force)
- return;
+ for (i=0; i<0x10000; i++)
+ if ((inb_p(0x64) & 0x02) == 0)
+ break;
+}
- if (first_entry) {
- first_entry = 0;
- smp_call_function((void *)machine_restart, NULL, 1, 0);
- }
-
- smp_stop_cpu();
+void machine_shutdown(void)
+{
+ /* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+ int reboot_cpu_id;
- /* AP calling this. Just halt */
- if (cpuid != boot_cpu_id) {
- for (;;)
- asm("hlt");
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
}
- /* Wait for all other CPUs to have run smp_stop_cpu */
- while (!cpus_empty(cpu_online_map))
- rep_nop();
-}
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K Now that I'm on the appropriate processor,
+ * stop all of the others.
+ */
+ smp_send_stop();
#endif
-static inline void kb_wait(void)
-{
- int i;
+ local_irq_disable();
- for (i=0; i<0x10000; i++)
- if ((inb_p(0x64) & 0x02) == 0)
- break;
+#ifndef CONFIG_SMP
+ disable_local_APIC();
+#endif
+
+ disable_IO_APIC();
+
+ local_irq_enable();
}
void machine_restart(char * __unused)
@@ -109,9 +115,7 @@ void machine_restart(char * __unused)
printk("machine restart\n");
-#ifdef CONFIG_SMP
- smp_halt();
-#endif
+ machine_shutdown();
if (!reboot_force) {
local_irq_disable();
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 00000000000..d24fa9b72a2
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+ /*
+ * Must be relocatable PIC code callable as a C function, that once
+ * it starts can not use the previous processes stack.
+ */
+ .globl relocate_new_kernel
+ .code64
+relocate_new_kernel:
+ /* %rdi page_list
+ * %rsi reboot_code_buffer
+ * %rdx start address
+ * %rcx page_table
+ * %r8 arg5
+ * %r9 arg6
+ */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* set a new stack at the bottom of our page... */
+ lea 4096(%rsi), %rsp
+
+ /* store the parameters back on the stack */
+ pushq %rdx /* store the start address */
+
+ /* Set cr0 to a known state:
+ * 31 1 == Paging enabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+ * 0 1 == Proctected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+ orl $((1<<31)|(1<<0)), %eax
+ movq %rax, %cr0
+
+ /* Set cr4 to a known state:
+ * 10 0 == xmm exceptions disabled
+ * 9 0 == xmm registers instructions disabled
+ * 8 0 == performance monitoring counter disabled
+ * 7 0 == page global disabled
+ * 6 0 == machine check exceptions disabled
+ * 5 1 == physical address extension enabled
+ * 4 0 == page size extensions disabled
+ * 3 0 == Debug extensions disabled
+ * 2 0 == Time stamp disable (disabled)
+ * 1 0 == Protected mode virtual interrupts disabled
+ * 0 0 == VME disabled
+ */
+
+ movq $((1<<5)), %rax
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Switch to the identity mapped page tables,
+ * and flush the TLB.
+ */
+ movq %rcx, %cr3
+
+ /* Do the copies */
+ movq %rdi, %rcx /* Put the page_list in %rcx */
+ xorq %rdi, %rdi
+ xorq %rsi, %rsi
+ jmp 1f
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+1:
+ testq $0x1, %rcx /* is it a destination page? */
+ jz 2f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi
+ jmp 0b
+2:
+ testq $0x2, %rcx /* is it an indirection page? */
+ jz 2f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx
+ jmp 0b
+2:
+ testq $0x4, %rcx /* is it the done indicator? */
+ jz 2f
+ jmp 3f
+2:
+ testq $0x8, %rcx /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For ever source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+
+ movq $512, %rcx
+ rep ; movsq
+ jmp 0b
+3:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %rsp alone */
+
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 99f038ede23..b02d921da4f 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -40,6 +40,9 @@
#include <linux/acpi.h>
#include <linux/kallsyms.h>
#include <linux/edd.h>
+#include <linux/mmzone.h>
+#include <linux/kexec.h>
+
#include <asm/mtrr.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -348,7 +351,7 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (!memcmp(from, "mem=", 4))
parse_memopt(from+4, &from);
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
if (!memcmp(from, "numa=", 5))
numa_setup(from+5);
#endif
@@ -365,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (!memcmp(from, "noexec=", 7))
nonx_setup(from + 7);
+#ifdef CONFIG_KEXEC
+ /* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never set's it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+ else if (!memcmp(from, "crashkernel=", 12)) {
+ unsigned long size, base;
+ size = memparse(from+12, &from);
+ if (*from == '@') {
+ base = memparse(from+1, &from);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ }
+#endif
+
next_char:
c = *(from++);
if (!c)
@@ -377,17 +401,20 @@ static __init void parse_cmdline_early (char ** cmdline_p)
*cmdline_p = command_line;
}
-#ifndef CONFIG_DISCONTIGMEM
-static void __init contig_initmem_init(void)
+#ifndef CONFIG_NUMA
+static void __init
+contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long bootmap_size, bootmap;
- bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
- if (bootmap == -1L)
- panic("Cannot find bootmem map of size %ld\n",bootmap_size);
- bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size);
+ unsigned long bootmap_size, bootmap;
+
+ memory_present(0, start_pfn, end_pfn);
+ bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n",bootmap_size);
+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
+ e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+ reserve_bootmem(bootmap, bootmap_size);
}
#endif
@@ -554,10 +581,10 @@ void __init setup_arch(char **cmdline_p)
acpi_numa_init();
#endif
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
numa_initmem_init(0, end_pfn);
#else
- contig_initmem_init();
+ contig_initmem_init(0, end_pfn);
#endif
/* Reserve direct mapping */
@@ -618,6 +645,15 @@ void __init setup_arch(char **cmdline_p)
}
}
#endif
+
+ sparse_init();
+
+#ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end) {
+ reserve_bootmem(crashk_res.start,
+ crashk_res.end - crashk_res.start + 1);
+ }
+#endif
paging_init();
check_ioapic();
@@ -669,7 +705,7 @@ void __init setup_arch(char **cmdline_p)
#endif
}
-static int __init get_model_name(struct cpuinfo_x86 *c)
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
@@ -685,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c)
}
-static void __init display_cacheinfo(struct cpuinfo_x86 *c)
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, eax, ebx, ecx, edx;
@@ -796,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
return r;
}
-static void __init detect_ht(struct cpuinfo_x86 *c)
+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
@@ -857,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c)
/*
* find out the number of processor cores on the die
*/
-static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax;
@@ -875,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
return 1;
}
-static void __init init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned n;
@@ -895,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c)
c->x86_num_cores = intel_num_cpu_cores(c);
}
-void __init get_cpu_vendor(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
@@ -916,7 +952,7 @@ struct cpu_model_info {
/* Do some early cpuid on the boot CPU to get some parameter that are
needed before check_bugs. Everything advanced is in identify_cpu
below. */
-void __init early_identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
{
u32 tfms;
@@ -970,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __init identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
u32 xlvl;
@@ -1047,7 +1083,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
}
-void __init print_cpu_info(struct cpuinfo_x86 *c)
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
printk("%s", c->x86_model_id);
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 678b7ac33b8..f2b9c6bc999 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -29,7 +29,7 @@
char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
-cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
@@ -171,7 +171,7 @@ void syscall_init(void)
wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
}
-void __init check_efer(void)
+void __cpuinit check_efer(void)
{
unsigned long efer;
@@ -188,7 +188,7 @@ void __init check_efer(void)
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
-void __init cpu_init (void)
+void __cpuinit cpu_init (void)
{
#ifdef CONFIG_SMP
int cpu = stack_smp_processor_id();
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 429c0269dc4..98590a989f3 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -34,9 +34,9 @@
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
-void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs);
-void ia32_setup_frame(int sig, struct k_sigaction *ka,
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
sigset_t *set, struct pt_regs * regs);
asmlinkage long
@@ -238,7 +238,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
return (void __user *)round_down(rsp - size, 16);
}
-static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs)
{
struct rt_sigframe __user *frame;
@@ -327,20 +327,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return;
+ return 1;
give_sigsegv:
force_sigsegv(sig, current);
+ return 0;
}
/*
* OK, we're invoking a handler
*/
-static void
+static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
sigset_t *oldset, struct pt_regs *regs)
{
+ int ret;
+
#ifdef DEBUG_SIG
printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n",
current->pid, sig,
@@ -384,20 +387,22 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
#ifdef CONFIG_IA32_EMULATION
if (test_thread_flag(TIF_IA32)) {
if (ka->sa.sa_flags & SA_SIGINFO)
- ia32_setup_rt_frame(sig, ka, info, oldset, regs);
+ ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
- ia32_setup_frame(sig, ka, oldset, regs);
+ ret = ia32_setup_frame(sig, ka, oldset, regs);
} else
#endif
- setup_rt_frame(sig, ka, info, oldset, regs);
+ ret = setup_rt_frame(sig, ka, info, oldset, regs);
- if (!(ka->sa.sa_flags & SA_NODEFER)) {
+ if (ret && !(ka->sa.sa_flags & SA_NODEFER)) {
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
sigaddset(&current->blocked,sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
}
+
+ return ret;
}
/*
@@ -417,10 +422,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
* kernel mode. Just return without doing anything
* if so.
*/
- if ((regs->cs & 3) != 3)
+ if (!user_mode(regs))
return 1;
- if (try_to_freeze(0))
+ if (try_to_freeze())
goto no_signal;
if (!oldset)
@@ -434,11 +439,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
* inside the kernel.
*/
if (current->thread.debugreg7)
- asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7));
+ set_debugreg(current->thread.debugreg7, 7);
/* Whee! Actually deliver the signal. */
- handle_signal(signr, &info, &ka, oldset, regs);
- return 1;
+ return handle_signal(signr, &info, &ka, oldset, regs);
}
no_signal:
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 1e379ed17b1..ccae392886a 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -283,6 +283,16 @@ struct call_data_struct {
static struct call_data_struct * call_data;
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f1ec0f34594..b969ee12872 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
* Andi Kleen : Converted to new state machine.
* Various cleanups.
* Probably mostly hotplug CPU ready now.
+ * Ashok Raj : CPU hotplug support
*/
@@ -58,11 +59,6 @@
#include <asm/proto.h>
#include <asm/nmi.h>
-/* Change for real CPU hotplug. Note other files need to be fixed
- first too. */
-#define __cpuinit __init
-#define __cpuinitdata __initdata
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
@@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
extern unsigned char trampoline_data[];
extern unsigned char trampoline_end[];
+/* State of each CPU */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Store all idle threads, this can be reused instead of creating
+ * a new thread. Also avoids complicated thread destroy functionality
+ * for idle threads.
+ */
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+
+#define get_idle_for_cpu(x) (idle_thread_array[(x)])
+#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
+
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * If cpu-hotplug is supported, then we need to preallocate for all
+ * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
+ * - Ashok Raj
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
+#else
+#define fixup_cpu_possible_map(x)
+#endif
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -418,6 +445,33 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+static inline void set_cpu_sibling_map(int cpu)
+{
+ int i;
+
+ if (smp_num_siblings > 1) {
+ for_each_cpu(i) {
+ if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ cpu_set(i, cpu_sibling_map[cpu]);
+ cpu_set(cpu, cpu_sibling_map[i]);
+ }
+ }
+ } else {
+ cpu_set(cpu, cpu_sibling_map[cpu]);
+ }
+
+ if (current_cpu_data.x86_num_cores > 1) {
+ for_each_cpu(i) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ }
+ }
+ } else {
+ cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ }
+}
+
/*
* Setup code on secondary processor (after comming out of the trampoline)
*/
@@ -448,9 +502,28 @@ void __cpuinit start_secondary(void)
enable_APIC_timer();
/*
+ * The sibling maps must be set before turing the online map on for
+ * this cpu
+ */
+ set_cpu_sibling_map(smp_processor_id());
+
+ /*
+ * We need to hold call_lock, so there is no inconsistency
+ * between the time smp_call_function() determines number of
+ * IPI receipients, and the time when the determination is made
+ * for which cpus receive the IPI in genapic_flat.c. Holding this
+ * lock helps us to not include this cpu in a currently in progress
+ * smp_call_function().
+ */
+ lock_ipi_call_lock();
+
+ /*
* Allow the master to continue.
*/
cpu_set(smp_processor_id(), cpu_online_map);
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ unlock_ipi_call_lock();
+
mb();
/* Wait for TSC sync to not schedule things before.
@@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
return (send_status | accept_status);
}
+struct create_idle {
+ struct task_struct *idle;
+ struct completion done;
+ int cpu;
+};
+
+void do_fork_idle(void *_c_idle)
+{
+ struct create_idle *c_idle = _c_idle;
+
+ c_idle->idle = fork_idle(c_idle->cpu);
+ complete(&c_idle->done);
+}
+
/*
* Boot one CPU.
*/
static int __cpuinit do_boot_cpu(int cpu, int apicid)
{
- struct task_struct *idle;
unsigned long boot_error;
int timeout;
unsigned long start_rip;
+ struct create_idle c_idle = {
+ .cpu = cpu,
+ .done = COMPLETION_INITIALIZER(c_idle.done),
+ };
+ DECLARE_WORK(work, do_fork_idle, &c_idle);
+
+ c_idle.idle = get_idle_for_cpu(cpu);
+
+ if (c_idle.idle) {
+ c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
+ init_idle(c_idle.idle, cpu);
+ goto do_rest;
+ }
+
/*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
+ * During cold boot process, keventd thread is not spun up yet.
+ * When we do cpu hot-add, we create idle threads on the fly, we should
+ * not acquire any attributes from the calling context. Hence the clean
+ * way to create kernel_threads() is to do that from keventd().
+ * We do the current_is_keventd() due to the fact that ACPI notifier
+ * was also queuing to keventd() and when the caller is already running
+ * in context of keventd(), we would end up with locking up the keventd
+ * thread.
*/
- idle = fork_idle(cpu);
- if (IS_ERR(idle)) {
+ if (!keventd_up() || current_is_keventd())
+ work.func(work.data);
+ else {
+ schedule_work(&work);
+ wait_for_completion(&c_idle.done);
+ }
+
+ if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
- return PTR_ERR(idle);
+ return PTR_ERR(c_idle.idle);
}
- cpu_pda[cpu].pcurrent = idle;
+ set_idle_for_cpu(cpu, c_idle.idle);
+
+do_rest:
+
+ cpu_pda[cpu].pcurrent = c_idle.idle;
start_rip = setup_trampoline();
- init_rsp = idle->thread.rsp;
+ init_rsp = c_idle.idle->thread.rsp;
per_cpu(init_tss,cpu).rsp0 = init_rsp;
initial_code = start_secondary;
- clear_ti_thread_flag(idle->thread_info, TIF_FORK);
+ clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
start_rip, init_rsp);
@@ -746,51 +863,6 @@ cycles_t cacheflush_time;
unsigned long cache_decay_ticks;
/*
- * Construct cpu_sibling_map[], so that we can tell the sibling CPU
- * on SMT systems efficiently.
- */
-static __cpuinit void detect_siblings(void)
-{
- int cpu;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpus_clear(cpu_sibling_map[cpu]);
- cpus_clear(cpu_core_map[cpu]);
- }
-
- for_each_online_cpu (cpu) {
- struct cpuinfo_x86 *c = cpu_data + cpu;
- int siblings = 0;
- int i;
- if (smp_num_siblings > 1) {
- for_each_online_cpu (i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
- siblings++;
- cpu_set(i, cpu_sibling_map[cpu]);
- }
- }
- } else {
- siblings++;
- cpu_set(cpu, cpu_sibling_map[cpu]);
- }
-
- if (siblings != smp_num_siblings) {
- printk(KERN_WARNING
- "WARNING: %d siblings found for CPU%d, should be %d\n",
- siblings, cpu, smp_num_siblings);
- smp_num_siblings = siblings;
- }
- if (c->x86_num_cores > 1) {
- for_each_online_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i])
- cpu_set(i, cpu_core_map[cpu]);
- }
- } else
- cpu_core_map[cpu] = cpu_sibling_map[cpu];
- }
-}
-
-/*
* Cleanup possible dangling ends...
*/
static __cpuinit void smp_cleanup_boot(void)
@@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void)
*
* RED-PEN audit/test this more. I bet there is more state messed up here.
*/
-static __cpuinit void disable_smp(void)
+static __init void disable_smp(void)
{
cpu_present_map = cpumask_of_cpu(0);
cpu_possible_map = cpumask_of_cpu(0);
@@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void)
/*
* Handle user cpus=... parameter.
*/
-static __cpuinit void enforce_max_cpus(unsigned max_cpus)
+static __init void enforce_max_cpus(unsigned max_cpus)
{
int i, k;
k = 0;
@@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus)
/*
* Various sanity checks.
*/
-static int __cpuinit smp_sanity_check(unsigned max_cpus)
+static int __init smp_sanity_check(unsigned max_cpus)
{
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
@@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus)
* Prepare for SMP bootup. The MP table or ACPI has been read
* earlier. Just do some sanity checking here and enable APIC mode.
*/
-void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus(unsigned int max_cpus)
{
int i;
@@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
int apicid = cpu_present_to_apicid(i);
if (physid_isset(apicid, phys_cpu_present_map)) {
cpu_set(i, cpu_present_map);
- /* possible map would be different if we supported real
- CPU hotplug. */
cpu_set(i, cpu_possible_map);
}
+ fixup_cpu_possible_map(i);
}
if (smp_sanity_check(max_cpus) < 0) {
@@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void)
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ per_cpu(cpu_state, me) = CPU_ONLINE;
}
/*
* Entry point to boot a CPU.
- *
- * This is all __cpuinit, not __devinit for now because we don't support
- * CPU hotplug (yet).
*/
int __cpuinit __cpu_up(unsigned int cpu)
{
@@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu)
return -EINVAL;
}
+ /*
+ * Already booted CPU?
+ */
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ Dprintk("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Boot it! */
err = do_boot_cpu(cpu, apicid);
if (err < 0) {
@@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu)
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
- return 0;
+ err = 0;
+
+ return err;
}
/*
* Finish the SMP boot.
*/
-void __cpuinit smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(unsigned int max_cpus)
{
+#ifndef CONFIG_HOTPLUG_CPU
zap_low_mappings();
+#endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- detect_siblings();
time_init_gtod();
check_nmi_watchdog();
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+
+ for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+ cpu_clear(cpu, cpu_sibling_map[sibling]);
+ for_each_cpu_mask(sibling, cpu_core_map[cpu])
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
+}
+
+void remove_cpu_from_maps(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+}
+
+int __cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ disable_APIC_timer();
+
+ /*
+ * HACK:
+ * Allow any queued timer interrupts to get serviced
+ * This is only a temporary solution until we cleanup
+ * fixup_irqs as we do for IA64.
+ */
+ local_irq_enable();
+ mdelay(1);
+
+ local_irq_disable();
+ remove_siblinginfo(cpu);
+
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+ remove_cpu_from_maps();
+ fixup_irqs(cpu_online_map);
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ printk ("CPU %d is now offline\n", cpu);
+ return;
+ }
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebaa1e37d65..6c0f402e3a8 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt)
*/
asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
- asm volatile ("sldt %0" : "=m" (ctxt->ldt));
asm volatile ("str %0" : "=m" (ctxt->tr));
/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
@@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt)
asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
+ asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8));
}
void save_processor_state(void)
@@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt)
/*
* control registers
*/
+ asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
/*
+ * now restore the descriptor tables to their proper values
+ * ltr is done i fix_processor_context().
+ */
+ asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+ asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+
+ /*
* segment registers
*/
asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
@@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
- /*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
- */
- asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
- asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
- asm volatile ("lldt %0" :: "m" (ctxt->ldt));
-
fix_processor_context();
do_fpu_end();
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index fb8c809b4cd..66bf6ddeb0c 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -64,6 +64,7 @@ static int notsc __initdata = 0;
unsigned int cpu_khz; /* TSC clocks / usec, not used here */
static unsigned long hpet_period; /* fsecs / HPET clock */
unsigned long hpet_tick; /* HPET clocks / interrupt */
+static int hpet_use_timer;
unsigned long vxtime_hz = PIT_TICK_RATE;
int report_lost_ticks; /* command line option */
unsigned long long monotonic_base;
@@ -105,7 +106,9 @@ static inline unsigned int do_gettimeoffset_tsc(void)
static inline unsigned int do_gettimeoffset_hpet(void)
{
- return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32;
+ /* cap counter read to one tick to avoid inconsistencies */
+ unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
+ return (min(counter,hpet_tick) * vxtime.quot) >> 32;
}
unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
@@ -301,7 +304,7 @@ unsigned long long monotonic_clock(void)
last_offset = vxtime.last;
base = monotonic_base;
- this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
+ this_offset = hpet_readl(HPET_COUNTER);
} while (read_seqretry(&xtime_lock, seq));
offset = (this_offset - last_offset);
@@ -377,7 +380,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
write_seqlock(&xtime_lock);
- if (vxtime.hpet_address) {
+ if (vxtime.hpet_address)
+ offset = hpet_readl(HPET_COUNTER);
+
+ if (hpet_use_timer) {
+ /* if we're using the hpet timer functionality,
+ * we can more accurately know the counter value
+ * when the timer interrupt occured.
+ */
offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
delay = hpet_readl(HPET_COUNTER) - offset;
} else {
@@ -803,17 +813,18 @@ static int hpet_timer_stop_set_go(unsigned long tick)
* Set up timer 0, as periodic with first interrupt to happen at hpet_tick,
* and period also hpet_tick.
*/
-
- hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ if (hpet_use_timer) {
+ hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
HPET_TN_32BIT, HPET_T0_CFG);
- hpet_writel(hpet_tick, HPET_T0_CMP);
- hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
-
+ hpet_writel(hpet_tick, HPET_T0_CMP);
+ hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
+ cfg |= HPET_CFG_LEGACY;
+ }
/*
* Go!
*/
- cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY;
+ cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
return 0;
@@ -834,8 +845,7 @@ static int hpet_init(void)
id = hpet_readl(HPET_ID);
- if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) ||
- !(id & HPET_ID_LEGSUP))
+ if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER))
return -1;
hpet_period = hpet_readl(HPET_PERIOD);
@@ -845,6 +855,8 @@ static int hpet_init(void)
hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
hpet_period;
+ hpet_use_timer = (id & HPET_ID_LEGSUP);
+
return hpet_timer_stop_set_go(hpet_tick);
}
@@ -901,9 +913,11 @@ void __init time_init(void)
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
- if (!hpet_init()) {
+ if (!hpet_init())
vxtime_hz = (1000000000000000L + hpet_period / 2) /
hpet_period;
+
+ if (hpet_use_timer) {
cpu_khz = hpet_calibrate_tsc();
timename = "HPET";
#ifdef CONFIG_X86_PM_TIMER
@@ -968,7 +982,7 @@ void __init time_init_gtod(void)
if (unsynchronized_tsc())
notsc = 1;
if (vxtime.hpet_address && notsc) {
- timetype = "HPET";
+ timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
vxtime.mode = VXTIME_HPET;
do_gettimeoffset = do_gettimeoffset_hpet;
@@ -983,7 +997,7 @@ void __init time_init_gtod(void)
printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
#endif
} else {
- timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC";
+ timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
vxtime.mode = VXTIME_TSC;
}
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 3dfec8fdabc..10273663000 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -274,7 +274,7 @@ EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
{
int i;
- int in_kernel = (regs->cs & 3) == 0;
+ int in_kernel = !user_mode(regs);
unsigned long rsp;
const int cpu = safe_smp_processor_id();
struct task_struct *cur = cpu_pda[cpu].pcurrent;
@@ -318,7 +318,7 @@ void handle_BUG(struct pt_regs *regs)
struct bug_frame f;
char tmp;
- if (regs->cs & 3)
+ if (user_mode(regs))
return;
if (__copy_from_user(&f, (struct bug_frame *) regs->rip,
sizeof(struct bug_frame)))
@@ -437,7 +437,7 @@ static void do_trap(int trapnr, int signr, char *str,
}
#endif
- if ((regs->cs & 3) != 0) {
+ if (user_mode(regs)) {
struct task_struct *tsk = current;
if (exception_trace && unhandled_signal(tsk, signr))
@@ -522,7 +522,7 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
}
#endif
- if ((regs->cs & 3)!=0) {
+ if (user_mode(regs)) {
struct task_struct *tsk = current;
if (exception_trace && unhandled_signal(tsk, SIGSEGV))
@@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
asmlinkage void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
+
+ cpu = smp_processor_id();
/* Only the BSP gets external NMIs from the system. */
- if (!smp_processor_id())
+ if (!cpu)
reason = get_nmi_reason();
+ if (!cpu_online(cpu))
+ return;
+
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
== NOTIFY_STOP)
@@ -638,7 +644,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs)
if (eregs == (struct pt_regs *)eregs->rsp)
;
/* Exception from user space */
- else if (eregs->cs & 3)
+ else if (user_mode(eregs))
regs = ((struct pt_regs *)current->thread.rsp0) - 1;
/* Exception from kernel and interrupts are enabled. Move to
kernel process stack. */
@@ -669,7 +675,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code)
}
#endif
- asm("movq %%db6,%0" : "=r" (condition));
+ get_debugreg(condition, 6);
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
SIGTRAP) == NOTIFY_STOP)
@@ -697,7 +703,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code)
* allowing programs to debug themselves without the ptrace()
* interface.
*/
- if ((regs->cs & 3) == 0)
+ if (!user_mode(regs))
goto clear_TF_reenable;
/*
* Was the TF flag set by a debugger? If so, clear it now,
@@ -715,13 +721,13 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code)
info.si_signo = SIGTRAP;
info.si_errno = 0;
info.si_code = TRAP_BRKPT;
- if ((regs->cs & 3) == 0)
+ if (!user_mode(regs))
goto clear_dr7;
info.si_addr = (void __user *)regs->rip;
force_sig_info(SIGTRAP, &info, tsk);
clear_dr7:
- asm volatile("movq %0,%%db7"::"r"(0UL));
+ set_debugreg(0UL, 7);
return;
clear_TF_reenable:
@@ -756,7 +762,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
unsigned short cwd, swd;
conditional_sti(regs);
- if ((regs->cs & 3) == 0 &&
+ if (!user_mode(regs) &&
kernel_math_error(regs, "kernel x87 math error"))
return;
@@ -822,7 +828,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
unsigned short mxcsr;
conditional_sti(regs);
- if ((regs->cs & 3) == 0 &&
+ if (!user_mode(regs) &&
kernel_math_error(regs, "kernel simd math error"))
return;
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 59ebd5beda8..73389f51c4e 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -2,7 +2,10 @@
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*/
+#define LOAD_OFFSET __START_KERNEL_map
+
#include <asm-generic/vmlinux.lds.h>
+#include <asm/page.h>
#include <linux/config.h>
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
@@ -11,28 +14,30 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
SECTIONS
{
- . = 0xffffffff80100000;
+ . = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
_text = .; /* Text and read-only data */
- .text : {
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
*(.text)
SCHED_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
} = 0x9090
- .text.lock : { *(.text.lock) } /* out-of-line lock text */
+ /* out-of-line lock text */
+ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
_etext = .; /* End of text section */
. = ALIGN(16); /* Exception table */
__start___ex_table = .;
- __ex_table : { *(__ex_table) }
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
RODATA
- .data : { /* Data */
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
*(.data)
CONSTRUCTORS
}
@@ -40,62 +45,95 @@ SECTIONS
_edata = .; /* End of data section */
__bss_start = .; /* BSS */
- .bss : {
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
*(.bss.page_aligned)
*(.bss)
}
__bss_end = .;
+ . = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
-#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
-#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
- .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
- __vsyscall_0 = LOADADDR(.vsyscall_0);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
- xtime_lock = LOADADDR(.xtime_lock);
- .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
- vxtime = LOADADDR(.vxtime);
- .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
- wall_jiffies = LOADADDR(.wall_jiffies);
- .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
- sys_tz = LOADADDR(.sys_tz);
- .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
- sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
- .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
- xtime = LOADADDR(.xtime);
+ .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
+ xtime_lock = VVIRT(.xtime_lock);
+
+ .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
+ vxtime = VVIRT(.vxtime);
+
+ .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
+ wall_jiffies = VVIRT(.wall_jiffies);
+
+ .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
+ sys_tz = VVIRT(.sys_tz);
+
+ .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
+ sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
+
+ .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
+ xtime = VVIRT(.xtime);
+
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
- jiffies = LOADADDR(.jiffies);
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
- . = LOADADDR(.vsyscall_0) + 4096;
+ .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
+
+ . = VSYSCALL_VIRT_ADDR + 4096;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
. = ALIGN(8192); /* init_task */
- .data.init_task : { *(.data.init_task) }
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
. = ALIGN(4096);
- .data.page_aligned : { *(.data.page_aligned) }
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ }
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
- .init.text : {
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
*(.init.text)
_einittext = .;
}
__initdata_begin = .;
- .init.data : { *(.init.data) }
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
__initdata_end = .;
. = ALIGN(16);
__setup_start = .;
- .init.setup : { *(.init.setup) }
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
__setup_end = .;
__initcall_start = .;
- .initcall.init : {
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
*(.initcall1.init)
*(.initcall2.init)
*(.initcall3.init)
@@ -106,32 +144,38 @@ SECTIONS
}
__initcall_end = .;
__con_initcall_start = .;
- .con_initcall.init : { *(.con_initcall.init) }
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ *(.con_initcall.init)
+ }
__con_initcall_end = .;
SECURITY_INIT
. = ALIGN(8);
__alt_instructions = .;
- .altinstructions : { *(.altinstructions) }
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ *(.altinstructions)
+ }
__alt_instructions_end = .;
- .altinstr_replacement : { *(.altinstr_replacement) }
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
- .exit.text : { *(.exit.text) }
- .exit.data : { *(.exit.data) }
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
. = ALIGN(4096);
__initramfs_start = .;
- .init.ramfs : { *(.init.ramfs) }
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
. = ALIGN(32);
__per_cpu_start = .;
- .data.percpu : { *(.data.percpu) }
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
- .data_nosave : { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
. = ALIGN(4096);
__nosave_end = .;
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index aed61a668a1..33a873a3c22 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -12,6 +12,7 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <asm/delay.h>
+#include <asm/msr.h>
#ifdef CONFIG_SMP
#include <asm/smp.h>
@@ -19,6 +20,12 @@
int x86_udelay_tsc = 0; /* Delay via TSC */
+int read_current_timer(unsigned long *timer_value)
+{
+ rdtscll(*timer_value);
+ return 0;
+}
+
void __delay(unsigned long loops)
{
unsigned bclock, now;
diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile
index 66c354ad80c..1d232a87f11 100644
--- a/arch/x86_64/mm/Makefile
+++ b/arch/x86_64/mm/Makefile
@@ -4,7 +4,7 @@
obj-y := init.o fault.o ioremap.o extable.o pageattr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_K8_NUMA) += k8topology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 57d3ab15a5c..2f187986f94 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -74,7 +74,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
instr = (unsigned char *)convert_rip_to_linear(current, regs);
max_instr = instr + 15;
- if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE)
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
return 0;
while (scan_more && instr < max_instr) {
@@ -106,7 +106,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
/* Could check the LDT for lm, but for now it's good
enough to assume that long mode only uses well known
segments or kernel. */
- scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
+ scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
break;
case 0x60:
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index dbe53b4c7e6..72e4b364ed7 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -318,7 +318,7 @@ void zap_low_mappings(void)
flush_tlb_all();
}
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NUMA
void __init paging_init(void)
{
{
@@ -427,13 +427,16 @@ void __init mem_init(void)
reservedpages = 0;
/* this will put all low memory onto the freelists */
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
totalram_pages += numa_free_all_bootmem();
tmp = 0;
/* should count reserved pages here for all nodes */
#else
+
+#ifdef CONFIG_FLATMEM
max_mapnr = end_pfn;
if (!mem_map) BUG();
+#endif
totalram_pages += free_all_bootmem();
@@ -515,7 +518,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
/* Should check here against the e820 map to avoid double free */
-#ifdef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_NUMA
int nid = phys_to_nid(phys);
reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index 58aac23760e..6972df480d2 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -178,7 +178,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l
if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
return (__force void __iomem *)phys_to_virt(phys_addr);
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index fd9f25d7a6c..ac61c186eb0 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -66,6 +66,13 @@ int __init compute_hash_shift(struct node *nodes, int numnodes)
return -1;
}
+#ifdef CONFIG_SPARSEMEM
+int early_pfn_to_nid(unsigned long pfn)
+{
+ return phys_to_nid(pfn << PAGE_SHIFT);
+}
+#endif
+
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
@@ -80,6 +87,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
+ memory_present(nodeid, start_pfn, end_pfn);
nodedata_phys = find_e820_area(start, end, pgdat_size);
if (nodedata_phys == -1L)
panic("Cannot find memory pgdat in node %d\n", nodeid);
@@ -243,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
-__init void numa_add_cpu(int cpu)
+__cpuinit void numa_add_cpu(int cpu)
{
/* BP is initialized elsewhere */
if (cpu)
diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c
index 62349c78db5..7e7d0c2a002 100644
--- a/arch/x86_64/pci/k8-bus.c
+++ b/arch/x86_64/pci/k8-bus.c
@@ -53,25 +53,11 @@ fill_mp_bus_to_cpumask(void)
for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus);
j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus);
j++)
- pci_bus_to_cpumask[j] =
- node_to_cpumask(NODE_ID(nid));
+ pci_bus_to_node[j] = NODE_ID(nid);
}
}
}
- /* quick sanity check */
- printed = 0;
- for (i = 0; i < 256; i++) {
- if (cpus_empty(pci_bus_to_cpumask[i])) {
- pci_bus_to_cpumask[i] = CPU_MASK_ALL;
- if (printed)
- continue;
- printk(KERN_ERR
- "k8-bus.c: some busses have empty cpu mask\n");
- printed = 1;
- }
- }
-
return 0;
}