diff options
Diffstat (limited to 'arch/x86_64')
48 files changed, 1552 insertions, 379 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 289f448ac89..d09437b5c48 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -207,33 +207,6 @@ config SMP If you don't know what to do here, say N. -config PREEMPT - bool "Preemptible Kernel" - ---help--- - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. On contrary it may also break your drivers and add - priority inheritance problems to your system. Don't select it if - you rely on a stable system or have slightly obscure hardware. - It's also not very well tested on x86-64 currently. - You have been warned. - - Say Y here if you are feeling brave and building a kernel for a - desktop, embedded or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. - config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP @@ -244,6 +217,8 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +source "kernel/Kconfig.preempt" + config K8_NUMA bool "K8 NUMA support" select NUMA @@ -265,7 +240,7 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. -config DISCONTIGMEM +config ARCH_DISCONTIGMEM_ENABLE bool depends on NUMA default y @@ -274,6 +249,27 @@ config NUMA bool default n +config ARCH_DISCONTIGMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_DISCONTIGMEM_DEFAULT + def_bool y + depends on NUMA + +config ARCH_SPARSEMEM_ENABLE + def_bool y + depends on NUMA + +config ARCH_FLATMEM_ENABLE + def_bool y + depends on !NUMA + +source "mm/Kconfig" + +config HAVE_ARCH_EARLY_PFN_TO_NID + def_bool y + config HAVE_DEC_LOCK bool depends on SMP @@ -292,6 +288,15 @@ config NR_CPUS This is purely to save memory - each supported CPU requires memory in the static kernel configuration. +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL + help + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu/cpu#. + Say N if you want to disable CPU hotplug. + + config HPET_TIMER bool default y @@ -364,6 +369,34 @@ config X86_MCE_INTEL Additional support for intel specific MCE features such as the thermal monitor. +config PHYSICAL_START + hex "Physical address where the kernel is loaded" if EMBEDDED + default "0x100000" + help + This gives the physical address where the kernel is loaded. + Primarily used in the case of kexec on panic where the + fail safe kernel needs to run at a different address than + the panic-ed kernel. + + Don't change this unless you know what you are doing. + +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is indepedent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similiarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS @@ -381,6 +414,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source kernel/Kconfig.hz + endmenu # diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 6f90c246c41..8a73794f9b9 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S -LDFLAGS_vmlinux := -e stext +LDFLAGS_vmlinux := CHECKFLAGS += -D__x86_64__ -m64 diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 27264dbd575..6f55565e4d4 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -2,8 +2,6 @@ * linux/boot/head.S * * Copyright (C) 1991, 1992, 1993 Linus Torvalds - * - * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $ */ /* @@ -21,13 +19,14 @@ */ /* - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ .code32 .text #include <linux/linkage.h> #include <asm/segment.h> +#include <asm/page.h> .code32 .globl startup_32 @@ -77,7 +76,7 @@ startup_32: jnz 3f addl $8,%esp xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START /* * We come here, if we were loaded high. @@ -103,7 +102,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $0x100000,%edi + movl $__PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine @@ -128,7 +127,7 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $0x100000 + ljmp $(__KERNEL_CS), $__PHYSICAL_START move_routine_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index c8b9216f9e6..b38d5b8b5fb 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -11,6 +11,7 @@ #include "miscsetup.h" #include <asm/io.h> +#include <asm/page.h> /* * gzip declarations @@ -92,8 +93,11 @@ static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); +void* memset(void* s, int c, unsigned n); +void* memcpy(void* dest, const void* src, unsigned n); + static void putstr(const char *); - + extern int end; static long free_mem_ptr = (long)&end; static long free_mem_end_ptr; @@ -284,7 +288,7 @@ void setup_normal_output_buffer(void) #else if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (char *)0x100000; /* Points to 1M */ + output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(0x100000 + low_buffer_size); + if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh index 90f2452b3b9..198af15a775 100644 --- a/arch/x86_64/boot/install.sh +++ b/arch/x86_64/boot/install.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# arch/i386/boot/install.sh +# arch/x86_64/boot/install.sh # # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive @@ -21,8 +21,8 @@ # User may have a custom install script -if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi -if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi +if [ -x ~/bin/${CROSS_COMPILE}installkernel ]; then exec ~/bin/${CROSS_COMPILE}installkernel "$@"; fi +if [ -x /sbin/${CROSS_COMPILE}installkernel ]; then exec /sbin/${CROSS_COMPILE}installkernel "$@"; fi # Default install - same as make zlilo diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 75d4d2ad93b..ff58b2832b7 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -33,7 +33,7 @@ * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. * <stiker@northlink.com> * - * Fix to work around buggy BIOSes which dont use carry bit correctly + * Fix to work around buggy BIOSes which don't use carry bit correctly * and/or report extended memory in CX/DX for e801h memory size detection * call. As a result the kernel got wrong figures. The int15/e801h docs * from Ralf Brown interrupt list seem to indicate AX/BX should be used @@ -383,7 +383,7 @@ sse_ok: # a whole bunch of different types, and allows memory holes and # everything. We scan through this memory map and build a list # of the first 32 memory areas, which we return at [E820MAP]. -# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm +# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification. #define SMAP 0x534d4150 @@ -436,7 +436,7 @@ bail820: meme801: stc # fix to work around buggy - xorw %cx,%cx # BIOSes which dont clear/set + xorw %cx,%cx # BIOSes which don't clear/set xorw %dx,%dx # carry on pass/error of # e801h memory size call # or merely pass cx,dx though @@ -733,7 +733,7 @@ flush_instr: # # but we yet haven't reloaded the CS register, so the default size # of the target offset still is 16 bit. -# However, using an operant prefix (0x66), the CPU will properly +# However, using an operand prefix (0x66), the CPU will properly # take our 48 bit far pointer. (INTeL 80386 Programmer's Reference # Manual, Mixing 16-bit and 32-bit code, page 16-6) diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c2fa6631317..18b5bac1c42 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -1,6 +1,4 @@ /* - * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $ - * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1997 Martin Mares */ @@ -8,7 +6,8 @@ /* * This file builds a disk-image from three different files: * - * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest + * - bootsect: compatibility mbr which prints an error message if + * someone tries to boot the kernel directly. * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system * diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index fbd09b5126c..66e2821533d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -428,8 +428,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) return (void __user *)((rsp - frame_size) & -8UL); } -void ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs * regs) +int ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs * regs) { struct sigframe __user *frame; int err = 0; @@ -514,14 +514,15 @@ void ia32_setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs * regs) +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; int err = 0; @@ -613,9 +614,9 @@ void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } - diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index f3ca0db85b5..cc935427d53 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -589,7 +589,7 @@ ia32_sys_call_table: .quad compat_sys_mq_timedreceive /* 280 */ .quad compat_sys_mq_notify .quad compat_sys_mq_getsetattr - .quad quiet_ni_syscall /* reserved for kexec */ + .quad compat_sys_kexec_load /* reserved for kexec */ .quad compat_sys_waitid .quad quiet_ni_syscall /* sys_altroot */ .quad sys_add_key diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 5ca4a4598fd..48f9e2c19cd 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index a4c630034cd..185faa911db 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -67,7 +67,7 @@ wakeup_code: shll $4, %eax addl $(gdta - wakeup_code), %eax movl %eax, gdt_48a +2 - wakeup_code - lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is + lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate movl $1, %eax # protected mode (PE) bit diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 504e6347499..c9a6b812e92 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -40,11 +40,7 @@ int fix_aperture __initdata = 1; static u32 __init allocate_aperture(void) { -#ifdef CONFIG_DISCONTIGMEM pg_data_t *nd0 = NODE_DATA(0); -#else - pg_data_t *nd0 = &contig_page_data; -#endif u32 aper_size; void *p; diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index f8e6cc4fecd..375d369570c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void) } } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) @@ -285,7 +321,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __init setup_local_APIC (void) +void __cpuinit setup_local_APIC (void) { unsigned int value, ver, maxlvt; @@ -534,7 +570,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __init apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void) local_irq_enable(); } -void __init setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ setup_APIC_timer(calibration_result); local_irq_enable(); } -void __init disable_APIC_timer(void) +void __cpuinit disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c new file mode 100644 index 00000000000..d7fa4248501 --- /dev/null +++ b/arch/x86_64/kernel/crash.c @@ -0,0 +1,35 @@ +/* + * Architecture specific (x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/irq.h> +#include <linux/reboot.h> +#include <linux/kexec.h> + +#include <asm/processor.h> +#include <asm/hardirq.h> +#include <asm/nmi.h> +#include <asm/hw_irq.h> + +note_buf_t crash_notes[NR_CPUS]; + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ +} diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 7c154dfff64..6ded3a50dfe 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -16,6 +16,7 @@ #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/string.h> +#include <linux/kexec.h> #include <asm/page.h> #include <asm/e820.h> #include <asm/proto.h> @@ -191,8 +192,6 @@ void __init e820_reserve_resources(void) int i; for (i = 0; i < e820.nr_map; i++) { struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; res = alloc_bootmem_low(sizeof(struct resource)); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; @@ -212,6 +211,9 @@ void __init e820_reserve_resources(void) */ request_resource(res, &code_resource); request_resource(res, &data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index e3a19e8ebbf..9631c747c5e 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,20 +2,24 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/tty.h> #include <asm/io.h> #include <asm/processor.h> /* Simple VGA output */ #ifdef __i386__ +#include <asm/setup.h> #define VGABASE (__ISA_IO_base + 0xb8000) #else +#include <asm/bootsetup.h> #define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif -#define MAX_YPOS 25 -#define MAX_XPOS 80 +#define MAX_YPOS max_ypos +#define MAX_XPOS max_xpos +static int max_ypos = 25, max_xpos = 80; static int current_ypos = 1, current_xpos = 0; static void early_vga_write(struct console *con, const char *str, unsigned n) @@ -196,7 +200,10 @@ int __init setup_early_printk(char *opt) } else if (!strncmp(buf, "ttyS", 4)) { early_serial_init(buf); early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { + } else if (!strncmp(buf, "vga", 3) + && SCREEN_INFO.orig_video_isVGA == 1) { + max_xpos = SCREEN_INFO.orig_video_cols; + max_ypos = SCREEN_INFO.orig_video_lines; early_console = &early_vga_console; } early_console_initialized = 1; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index b4cbbad0422..28284696508 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -7,6 +7,8 @@ * Hacked for x86-64 by James Cleverdon from i386 architecture code by * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. + * Ashok Raj <ashok.raj@intel.com> + * Removed IPI broadcast shortcut to support CPU hotplug */ #include <linux/config.h> #include <linux/threads.h> @@ -18,6 +20,46 @@ #include <asm/smp.h> #include <asm/ipi.h> +/* + * The following permit choosing broadcast IPI shortcut v.s sending IPI only + * to online cpus via the send_IPI_mask varient. + * The mask version is my preferred option, since it eliminates a lot of + * other extra code that would need to be written to cleanup intrs sent + * to a CPU while offline. + * + * Sending broadcast introduces lots of trouble in CPU hotplug situations. + * These IPI's are delivered to cpu's irrespective of their offline status + * and could pickup stale intr data when these CPUS are turned online. + * + * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with + * the idea of not using broadcast IPI's anymore. Hence the run time check + * is introduced, on his request so we can choose an alternate mechanism. + * + * Initial wacky performance tests that collect cycle counts show + * no increase in using mask v.s broadcast version. In fact they seem + * identical in terms of cycle counts. + * + * if we need to use broadcast, we need to do the following. + * + * cli; + * hold call_lock; + * clear any pending IPI, just ack and clear all pending intr + * set cpu_online_map; + * release call_lock; + * sti; + * + * The complicated dummy irq processing shown above is not required if + * we didnt sent IPI's to wrong CPU's in the first place. + * + * - Ashok Raj <ashok.raj@intel.com> + */ +#ifdef CONFIG_HOTPLUG_CPU +#define DEFAULT_SEND_IPI (1) +#else +#define DEFAULT_SEND_IPI (0) +#endif + +static int no_broadcast=DEFAULT_SEND_IPI; static cpumask_t flat_target_cpus(void) { @@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static void flat_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case. - */ - if (num_online_cpus() > 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); -} - -static void flat_send_IPI_all(int vector) -{ - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); -} - static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; @@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static inline void __local_flat_send_IPI_allbutself(int vector) +{ + if (no_broadcast) { + cpumask_t mask = cpu_online_map; + int this_cpu = get_cpu(); + + cpu_clear(this_cpu, mask); + flat_send_IPI_mask(mask, vector); + put_cpu(); + } + else + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} + +static inline void __local_flat_send_IPI_all(int vector) +{ + if (no_broadcast) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_allbutself(int vector) +{ + if (((num_online_cpus()) - 1) >= 1) + __local_flat_send_IPI_allbutself(vector); +} + +static void flat_send_IPI_all(int vector) +{ + __local_flat_send_IPI_all(vector); +} + static int flat_apic_id_registered(void) { return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); @@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb) return ((ebx >> 24) & 0xFF) >> index_msb; } +static __init int no_ipi_broadcast(char *str) +{ + get_option(&str, &no_broadcast); + printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : + "IPI Broadcast"); + return 1; +} + +__setup("no_ipi_broadcast", no_ipi_broadcast); + struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, @@ -125,3 +194,12 @@ struct genapic apic_flat = { .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, }; + +static int __init print_ipi_mode(void) +{ + printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : + "Shortcut"); + return 0; +} + +late_initcall(print_ipi_mode); diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 9bd2e7a4b81..8d765aa77a2 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -248,23 +248,23 @@ ENTRY(_stext) */ .org 0x1000 ENTRY(init_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ .org 0x2000 ENTRY(level3_ident_pgt) - .quad 0x0000000000104007 + .quad 0x0000000000004007 + __PHYSICAL_START .fill 511,8,0 .org 0x3000 ENTRY(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */ .fill 1,8,0 .org 0x4000 @@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table) .org 0xa000 ENTRY(level3_physmem_pgt) - .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ .org 0xb000 #ifdef CONFIG_ACPI_SLEEP ENTRY(wakeup_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ .fill 255,8,0 - .quad 0x000000000010a007 + .quad 0x000000000000a007 + __PHYSICAL_START .fill 254,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif .data diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 0f8c78dcd38..cf6ab147a2a 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -94,7 +94,7 @@ void __init x86_64_start_kernel(char * real_mode_data) s = strstr(saved_command_line, "earlyprintk="); if (s != NULL) setup_early_printk(s); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA s = strstr(saved_command_line, "numa="); if (s != NULL) numa_setup(s+5); diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index ba139cac57c..d9b22b633e3 100644 --- a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c @@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ -void __init fpu_init(void) +void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); extern void __bad_fxsave_alignment(void); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 7873d9ba881..a8916909512 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -157,14 +157,13 @@ static unsigned int startup_8259A_irq(unsigned int irq) } static struct hw_interrupt_type i8259A_irq_type = { - "XT-PIC", - startup_8259A_irq, - shutdown_8259A_irq, - enable_8259A_irq, - disable_8259A_irq, - mask_and_ack_8259A, - end_8259A_irq, - NULL + .typename = "XT-PIC", + .startup = startup_8259A_irq, + .shutdown = shutdown_8259A_irq, + .enable = enable_8259A_irq, + .disable = disable_8259A_irq, + .ack = mask_and_ack_8259A, + .end = end_8259A_irq, }; /* @@ -415,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afd87e64d0a..157190d986b 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type) /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int __init find_isa_irq_pin(int irq, int type) +static int find_isa_irq_pin(int irq, int type) { int i; @@ -1132,12 +1132,44 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. + */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + disconnect_bsp_APIC(pin != -1); } /* diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 62b112e4deb..cc3fb85f514 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/module.h> +#include <linux/delay.h> #include <asm/uaccess.h> #include <asm/io_apic.h> @@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) return 1; } +#ifdef CONFIG_HOTPLUG_CPU +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index f77f8a0ff18..4e680f87a75 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -27,6 +27,8 @@ * <prasanna@in.ibm.com> adapted for x86_64 * 2005-Mar Roland McGrath <roland@redhat.com> * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Rusty Lynch <rusty.lynch@intel.com> + * Added function return probes functionality */ #include <linux/config.h> @@ -37,18 +39,16 @@ #include <linux/slab.h> #include <linux/preempt.h> #include <linux/moduleloader.h> - +#include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/kdebug.h> static DECLARE_MUTEX(kprobe_mutex); -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_rsp; static kprobe_opcode_t *get_insn_slot(void); @@ -214,6 +214,21 @@ void arch_copy_kprobe(struct kprobe *p) BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ *ripdisp = disp; } + p->opcode = *p->addr; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + *p->addr = p->opcode; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } void arch_remove_kprobe(struct kprobe *p) @@ -223,10 +238,29 @@ void arch_remove_kprobe(struct kprobe *p) down(&kprobe_mutex); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +static inline void save_previous_kprobe(void) { - *p->addr = p->opcode; - regs->rip = (unsigned long)p->addr; + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_old_rflags_prev = kprobe_old_rflags; + kprobe_saved_rflags_prev = kprobe_saved_rflags; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_old_rflags = kprobe_old_rflags_prev; + kprobe_saved_rflags = kprobe_saved_rflags_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + current_kprobe = p; + kprobe_saved_rflags = kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kprobe_saved_rflags &= ~IF_MASK; } static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -240,6 +274,50 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->rip = (unsigned long)p->ainsn.insn; } +struct task_struct *arch_get_kprobe_task(void *ptr) +{ + return ((struct thread_info *) (((unsigned long) ptr) & + (~(THREAD_SIZE -1))))->task; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)regs->rsp; + struct kretprobe_instance *ri; + static void *orig_ret_addr; + + /* + * Save the return address when the return probe hits + * the first time, and use it to populate the (krprobe + * instance)->ret_addr for subsequent return probes at + * the same addrress since stack address would have + * the kretprobe_trampoline by then. + */ + if (((void*) *sara) != kretprobe_trampoline) + orig_ret_addr = (void*) *sara; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->stack_addr = sara; + ri->ret_addr = orig_ret_addr; + add_rp_inst(ri); + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + } else { + rp->nmissed++; + } +} + +void arch_kprobe_flush_task(struct task_struct *tk) +{ + struct kretprobe_instance *ri; + while ((ri = get_rp_inst_tsk(tk)) != NULL) { + *((unsigned long *)(ri->stack_addr)) = + (unsigned long) ri->ret_addr; + recycle_rp_inst(ri); + } +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled thorough out this function. @@ -264,9 +342,30 @@ int kprobe_handler(struct pt_regs *regs) regs->eflags |= kprobe_saved_rflags; unlock_kprobes(); goto no_kprobe; + } else if (kprobe_status == KPROBE_HIT_SSDONE) { + /* TODO: Provide re-entrancy from + * post_kprobes_handler() and avoid exception + * stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->rip = (unsigned long)p->addr; + ret = 1; + } else { + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the + * handler. We here save the original kprobe + * variables and just single step on instruction + * of the new probe without calling any user + * handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } - disarm_kprobe(p, regs); - ret = 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -296,11 +395,7 @@ int kprobe_handler(struct pt_regs *regs) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; - kprobe_saved_rflags = kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kprobe_saved_rflags &= ~IF_MASK; + set_current_kprobe(p, regs); if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -317,6 +412,55 @@ no_kprobe: } /* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct task_struct *tsk; + struct kretprobe_instance *ri; + struct hlist_head *head; + struct hlist_node *node; + unsigned long *sara = (unsigned long *)regs->rsp - 1; + + tsk = arch_get_kprobe_task(sara); + head = kretprobe_inst_table_head(tsk); + + hlist_for_each_entry(ri, node, head, hlist) { + if (ri->stack_addr == sara && ri->rp) { + if (ri->rp->handler) + ri->rp->handler(ri, regs); + } + } + return 0; +} + +void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + struct kretprobe_instance *ri; + /* RA already popped */ + unsigned long *sara = ((unsigned long *)regs->rsp) - 1; + + while ((ri = get_rp_inst(sara))) { + regs->rip = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + } + regs->eflags &= ~TF_MASK; +} + +/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" * instruction. To avoid the SMP problems that can occur when we @@ -401,13 +545,23 @@ int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } - resume_execution(current_kprobe, regs); + if (current_kprobe->post_handler != trampoline_post_handler) + resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_rflags; - unlock_kprobes(); + /* Restore the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } else { + unlock_kprobes(); + } +out: preempt_enable_no_resched(); /* diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c new file mode 100644 index 00000000000..60d1eff4156 --- /dev/null +++ b/arch/x86_64/kernel/machine_kexec.c @@ -0,0 +1,250 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/delay.h> +#include <linux/string.h> +#include <linux/reboot.h> +#include <asm/pda.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/hw_irq.h> + +#define LEVEL0_SIZE (1UL << 12UL) +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +#define LEVEL3_SIZE (1UL << 39UL) +#define LEVEL4_SIZE (1UL << 48UL) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE) +#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +static void init_level2_page(u64 *level2p, unsigned long addr) +{ + unsigned long end_addr; + + addr &= PAGE_MASK; + end_addr = addr + LEVEL2_SIZE; + while (addr < end_addr) { + *(level2p++) = addr | L1_ATTR; + addr += LEVEL1_SIZE; + } +} + +static int init_level3_page(struct kimage *image, u64 *level3p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL3_SIZE; + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + u64 *level2p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level2p = (u64 *)page_address(page); + init_level2_page(level2p, addr); + *(level3p++) = __pa(level2p) | L2_ATTR; + addr += LEVEL2_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + *(level3p++) = 0; + addr += LEVEL2_SIZE; + } +out: + return result; +} + + +static int init_level4_page(struct kimage *image, u64 *level4p, + unsigned long addr, unsigned long last_addr) +{ + unsigned long end_addr; + int result; + + result = 0; + addr &= PAGE_MASK; + end_addr = addr + LEVEL4_SIZE; + while ((addr < last_addr) && (addr < end_addr)) { + struct page *page; + u64 *level3p; + + page = kimage_alloc_control_pages(image, 0); + if (!page) { + result = -ENOMEM; + goto out; + } + level3p = (u64 *)page_address(page); + result = init_level3_page(image, level3p, addr, last_addr); + if (result) { + goto out; + } + *(level4p++) = __pa(level3p) | L3_ATTR; + addr += LEVEL3_SIZE; + } + /* clear the unused entries */ + while (addr < end_addr) { + *(level4p++) = 0; + addr += LEVEL3_SIZE; + } +out: + return result; +} + + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + u64 *level4p; + level4p = (u64 *)__va(start_pgtable); + return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT); +} + +static void set_idt(void *newidt, u16 limit) +{ + unsigned char curidt[10]; + + /* x86-64 supports unaliged loads & stores */ + (*(u16 *)(curidt)) = limit; + (*(u64 *)(curidt +2)) = (unsigned long)(newidt); + + __asm__ __volatile__ ( + "lidt %0\n" + : "=m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + unsigned char curgdt[10]; + + /* x86-64 supports unaligned loads & stores */ + (*(u16 *)(curgdt)) = limit; + (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt); + + __asm__ __volatile__ ( + "lgdt %0\n" + : "=m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%ss\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + ); +#undef STR +#undef __STR +} + +typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, + unsigned long control_code_buffer, + unsigned long start_address, + unsigned long pgtable) ATTRIB_NORET; + +const extern unsigned char relocate_new_kernel[]; +const extern unsigned long relocate_new_kernel_size; + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable, control_code_buffer; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + /* Place the code in the reboot code buffer */ + memcpy(__va(control_code_buffer), relocate_new_kernel, + relocate_new_kernel_size); + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + return; +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list; + unsigned long control_code_buffer; + unsigned long start_pgtable; + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Calculate the offsets */ + page_list = image->head; + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + control_code_buffer = start_pgtable + 4096UL; + + /* Set the low half of the page table to my identity mapped + * page table for kexec. Leave the high half pointing at the + * kernel pages. Don't bother to flush the global pages + * as that will happen when I fully switch to my identity mapped + * page table anyway. + */ + memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); + __flush_tlb(); + + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again unless you set the segment to a different selector. + * + * The more common model are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + /* now call it */ + rnk = (relocate_new_kernel_t) control_code_buffer; + (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); +} diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 3a89d735a4f..21e70625a49 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -327,7 +327,7 @@ static void mce_init(void *dummy) } /* Add per CPU specific workarounds here */ -static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { @@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) } } -static void __init mce_cpu_features(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c) * Called for each booted CPU to set up machine checks. * Must be called with preempt off. */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; @@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff memset(mcelog.entry, 0, next * sizeof(struct mce)); mcelog.next = 0; - synchronize_kernel(); + synchronize_sched(); /* Collect entries that were still getting written before the synchronize. */ @@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) -static __init int mce_init_device(void) +static __cpuinit int mce_init_device(void) { int err; if (!mce_available(&boot_cpu_data)) diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 4db9a640069..0be0a795981 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -42,7 +42,7 @@ done: irq_exit(); } -static void __init intel_init_thermal(struct cpuinfo_x86 *c) +static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) { u32 l, h; int tm2 = 0; @@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c) return; } -void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); } diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61a63be6b29..9c5aa2a790c 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -23,6 +23,7 @@ #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi.h> +#include <linux/module.h> #include <asm/smp.h> #include <asm/mtrr.h> @@ -45,7 +46,8 @@ int acpi_found_madt; int apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; +unsigned char pci_bus_to_node [256]; +EXPORT_SYMBOL(pci_bus_to_node); static int mp_current_pci_id = 0; /* I/O APIC entries */ @@ -904,11 +906,20 @@ void __init mp_config_acpi_legacy_irqs (void) return; } +#define MAX_GSI_NUM 4096 + int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrupts, which + * represent all possible interrupts, to the IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -943,11 +954,21 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; + return gsi_to_irq[gsi]; } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + if (edge_level) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 31c0f2e6ac9..4e44d6e6b7e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -static __init inline int nmi_known_cpu(void) +static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __init nmi_watchdog_default(void) +void __cpuinit nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index dce8bab4306..1d91271796e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -8,7 +8,8 @@ * * X86-64 port * Andi Kleen. - * + * + * CPU hotplug support - ashok.raj@intel.com * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ @@ -18,6 +19,7 @@ #include <stdarg.h> +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -34,6 +36,7 @@ #include <linux/ptrace.h> #include <linux/utsname.h> #include <linux/random.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -153,6 +156,29 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); +#ifdef CONFIG_HOTPLUG_CPU +DECLARE_PER_CPU(int, cpu_state); + +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + idle_task_exit(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + while (1) + safe_halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a @@ -173,6 +199,8 @@ void cpu_idle (void) idle = pm_idle; if (!idle) idle = default_idle; + if (cpu_is_offline(smp_processor_id())) + play_dead(); idle(); } @@ -203,7 +231,7 @@ static void mwait_idle(void) } } -void __init select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { static int printed; if (cpu_has(c, X86_FEATURE_MWAIT)) { @@ -293,6 +321,14 @@ void exit_thread(void) { struct task_struct *me = current; struct thread_struct *t = &me->thread; + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(me); + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); @@ -312,6 +348,13 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index be4b36f762c..57e71dbdfd6 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -66,41 +66,47 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_SMP -static void smp_halt(void) +static inline void kb_wait(void) { - int cpuid = safe_smp_processor_id(); - static int first_entry = 1; + int i; - if (reboot_force) - return; + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} - if (first_entry) { - first_entry = 0; - smp_call_function((void *)machine_restart, NULL, 1, 0); - } - - smp_stop_cpu(); +void machine_shutdown(void) +{ + /* Stop the cpus and apics */ +#ifdef CONFIG_SMP + int reboot_cpu_id; - /* AP calling this. Just halt */ - if (cpuid != boot_cpu_id) { - for (;;) - asm("hlt"); + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* Wait for all other CPUs to have run smp_stop_cpu */ - while (!cpus_empty(cpu_online_map)) - rep_nop(); -} + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. + */ + smp_send_stop(); #endif -static inline void kb_wait(void) -{ - int i; + local_irq_disable(); - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; +#ifndef CONFIG_SMP + disable_local_APIC(); +#endif + + disable_IO_APIC(); + + local_irq_enable(); } void machine_restart(char * __unused) @@ -109,9 +115,7 @@ void machine_restart(char * __unused) printk("machine restart\n"); -#ifdef CONFIG_SMP - smp_halt(); -#endif + machine_shutdown(); if (!reboot_force) { local_irq_disable(); diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S new file mode 100644 index 00000000000..d24fa9b72a2 --- /dev/null +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -0,0 +1,143 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/linkage.h> + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel + .code64 +relocate_new_kernel: + /* %rdi page_list + * %rsi reboot_code_buffer + * %rdx start address + * %rcx page_table + * %r8 arg5 + * %r9 arg6 + */ + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* set a new stack at the bottom of our page... */ + lea 4096(%rsi), %rsp + + /* store the parameters back on the stack */ + pushq %rdx /* store the start address */ + + /* Set cr0 to a known state: + * 31 1 == Paging enabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movq %cr0, %rax + andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax + orl $((1<<31)|(1<<0)), %eax + movq %rax, %cr0 + + /* Set cr4 to a known state: + * 10 0 == xmm exceptions disabled + * 9 0 == xmm registers instructions disabled + * 8 0 == performance monitoring counter disabled + * 7 0 == page global disabled + * 6 0 == machine check exceptions disabled + * 5 1 == physical address extension enabled + * 4 0 == page size extensions disabled + * 3 0 == Debug extensions disabled + * 2 0 == Time stamp disable (disabled) + * 1 0 == Protected mode virtual interrupts disabled + * 0 0 == VME disabled + */ + + movq $((1<<5)), %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Switch to the identity mapped page tables, + * and flush the TLB. + */ + movq %rcx, %cr3 + + /* Do the copies */ + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorq %rdi, %rdi + xorq %rsi, %rsi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testq $0x1, %rcx /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testq $0x2, %rcx /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testq $0x4, %rcx /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq $512, %rcx + rep ; movsq + jmp 0b +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. + */ + movq %cr3, %rax + movq %rax, %cr3 + + /* set all of the registers to known values */ + /* leave %rsp alone */ + + xorq %rax, %rax + xorq %rbx, %rbx + xorq %rcx, %rcx + xorq %rdx, %rdx + xorq %rsi, %rsi + xorq %rdi, %rdi + xorq %rbp, %rbp + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r9 + xorq %r11, %r11 + xorq %r12, %r12 + xorq %r13, %r13 + xorq %r14, %r14 + xorq %r15, %r15 + + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 99f038ede23..b02d921da4f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -40,6 +40,9 @@ #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> +#include <linux/mmzone.h> +#include <linux/kexec.h> + #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -348,7 +351,7 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA if (!memcmp(from, "numa=", 5)) numa_setup(from+5); #endif @@ -365,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (!memcmp(from, "noexec=", 7)) nonx_setup(from + 7); +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } +#endif + next_char: c = *(from++); if (!c) @@ -377,17 +401,20 @@ static __init void parse_cmdline_early (char ** cmdline_p) *cmdline_p = command_line; } -#ifndef CONFIG_DISCONTIGMEM -static void __init contig_initmem_init(void) +#ifndef CONFIG_NUMA +static void __init +contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long bootmap_size, bootmap; - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); - if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); - bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); - reserve_bootmem(bootmap, bootmap_size); + unsigned long bootmap_size, bootmap; + + memory_present(0, start_pfn, end_pfn); + bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n",bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + reserve_bootmem(bootmap, bootmap_size); } #endif @@ -554,10 +581,10 @@ void __init setup_arch(char **cmdline_p) acpi_numa_init(); #endif -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA numa_initmem_init(0, end_pfn); #else - contig_initmem_init(); + contig_initmem_init(0, end_pfn); #endif /* Reserve direct mapping */ @@ -618,6 +645,15 @@ void __init setup_arch(char **cmdline_p) } } #endif + + sparse_init(); + +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) { + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); + } +#endif paging_init(); check_ioapic(); @@ -669,7 +705,7 @@ void __init setup_arch(char **cmdline_p) #endif } -static int __init get_model_name(struct cpuinfo_x86 *c) +static int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; @@ -685,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c) } -static void __init display_cacheinfo(struct cpuinfo_x86 *c) +static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, eax, ebx, ecx, edx; @@ -796,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c) return r; } -static void __init detect_ht(struct cpuinfo_x86 *c) +static void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; @@ -857,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax; @@ -875,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __init init_intel(struct cpuinfo_x86 *c) +static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; @@ -895,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c) c->x86_num_cores = intel_num_cpu_cores(c); } -void __init get_cpu_vendor(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -916,7 +952,7 @@ struct cpu_model_info { /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -void __init early_identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { u32 tfms; @@ -970,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; u32 xlvl; @@ -1047,7 +1083,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c) } -void __init print_cpu_info(struct cpuinfo_x86 *c) +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) printk("%s", c->x86_model_id); diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 678b7ac33b8..f2b9c6bc999 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -29,7 +29,7 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; @@ -171,7 +171,7 @@ void syscall_init(void) wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); } -void __init check_efer(void) +void __cpuinit check_efer(void) { unsigned long efer; @@ -188,7 +188,7 @@ void __init check_efer(void) * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __init cpu_init (void) +void __cpuinit cpu_init (void) { #ifdef CONFIG_SMP int cpu = stack_smp_processor_id(); diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 429c0269dc4..98590a989f3 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -34,9 +34,9 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs); -void ia32_setup_frame(int sig, struct k_sigaction *ka, +int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); asmlinkage long @@ -238,7 +238,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(rsp - size, 16); } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; @@ -327,20 +327,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + int ret; + #ifdef DEBUG_SIG printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, @@ -384,20 +387,22 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { if (ka->sa.sa_flags & SA_SIGINFO) - ia32_setup_rt_frame(sig, ka, info, oldset, regs); + ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); else - ia32_setup_frame(sig, ka, oldset, regs); + ret = ia32_setup_frame(sig, ka, oldset, regs); } else #endif - setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } + + return ret; } /* @@ -417,10 +422,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. */ - if ((regs->cs & 3) != 3) + if (!user_mode(regs)) return 1; - if (try_to_freeze(0)) + if (try_to_freeze()) goto no_signal; if (!oldset) @@ -434,11 +439,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. */ if (current->thread.debugreg7) - asm volatile("movq %0,%%db7" : : "r" (current->thread.debugreg7)); + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; + return handle_signal(signr, &info, &ka, oldset, regs); } no_signal: diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 1e379ed17b1..ccae392886a 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -283,6 +283,16 @@ struct call_data_struct { static struct call_data_struct * call_data; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + /* * this function sends a 'generic call function' IPI to all other CPUs * in the system. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f1ec0f34594..b969ee12872 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -34,6 +34,7 @@ * Andi Kleen : Converted to new state machine. * Various cleanups. * Probably mostly hotplug CPU ready now. + * Ashok Raj : CPU hotplug support */ @@ -58,11 +59,6 @@ #include <asm/proto.h> #include <asm/nmi.h> -/* Change for real CPU hotplug. Note other files need to be fixed - first too. */ -#define __cpuinit __init -#define __cpuinitdata __initdata - /* Number of siblings per CPU package */ int smp_num_siblings = 1; /* Package ID of each logical CPU */ @@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map); extern unsigned char trampoline_data[]; extern unsigned char trampoline_end[]; +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* + * Store all idle threads, this can be reused instead of creating + * a new thread. Also avoids complicated thread destroy functionality + * for idle threads. + */ +struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; + +#define get_idle_for_cpu(x) (idle_thread_array[(x)]) +#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) + +/* + * cpu_possible_map should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * If cpu-hotplug is supported, then we need to preallocate for all + * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. + * - Ashok Raj + */ +#ifdef CONFIG_HOTPLUG_CPU +#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) +#else +#define fixup_cpu_possible_map(x) +#endif + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -418,6 +445,33 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static inline void set_cpu_sibling_map(int cpu) +{ + int i; + + if (smp_num_siblings > 1) { + for_each_cpu(i) { + if (cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_num_cores > 1) { + for_each_cpu(i) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } +} + /* * Setup code on secondary processor (after comming out of the trampoline) */ @@ -448,9 +502,28 @@ void __cpuinit start_secondary(void) enable_APIC_timer(); /* + * The sibling maps must be set before turing the online map on for + * this cpu + */ + set_cpu_sibling_map(smp_processor_id()); + + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI receipients, and the time when the determination is made + * for which cpus receive the IPI in genapic_flat.c. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); + + /* * Allow the master to continue. */ cpu_set(smp_processor_id(), cpu_online_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + unlock_ipi_call_lock(); + mb(); /* Wait for TSC sync to not schedule things before. @@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta return (send_status | accept_status); } +struct create_idle { + struct task_struct *idle; + struct completion done; + int cpu; +}; + +void do_fork_idle(void *_c_idle) +{ + struct create_idle *c_idle = _c_idle; + + c_idle->idle = fork_idle(c_idle->cpu); + complete(&c_idle->done); +} + /* * Boot one CPU. */ static int __cpuinit do_boot_cpu(int cpu, int apicid) { - struct task_struct *idle; unsigned long boot_error; int timeout; unsigned long start_rip; + struct create_idle c_idle = { + .cpu = cpu, + .done = COMPLETION_INITIALIZER(c_idle.done), + }; + DECLARE_WORK(work, do_fork_idle, &c_idle); + + c_idle.idle = get_idle_for_cpu(cpu); + + if (c_idle.idle) { + c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); + init_idle(c_idle.idle, cpu); + goto do_rest; + } + /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. + * During cold boot process, keventd thread is not spun up yet. + * When we do cpu hot-add, we create idle threads on the fly, we should + * not acquire any attributes from the calling context. Hence the clean + * way to create kernel_threads() is to do that from keventd(). + * We do the current_is_keventd() due to the fact that ACPI notifier + * was also queuing to keventd() and when the caller is already running + * in context of keventd(), we would end up with locking up the keventd + * thread. */ - idle = fork_idle(cpu); - if (IS_ERR(idle)) { + if (!keventd_up() || current_is_keventd()) + work.func(work.data); + else { + schedule_work(&work); + wait_for_completion(&c_idle.done); + } + + if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); - return PTR_ERR(idle); + return PTR_ERR(c_idle.idle); } - cpu_pda[cpu].pcurrent = idle; + set_idle_for_cpu(cpu, c_idle.idle); + +do_rest: + + cpu_pda[cpu].pcurrent = c_idle.idle; start_rip = setup_trampoline(); - init_rsp = idle->thread.rsp; + init_rsp = c_idle.idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); + clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, start_rip, init_rsp); @@ -746,51 +863,6 @@ cycles_t cacheflush_time; unsigned long cache_decay_ticks; /* - * Construct cpu_sibling_map[], so that we can tell the sibling CPU - * on SMT systems efficiently. - */ -static __cpuinit void detect_siblings(void) -{ - int cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - for_each_online_cpu (cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (smp_num_siblings > 1) { - for_each_online_cpu (i) { - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - if (c->x86_num_cores > 1) { - for_each_online_cpu(i) { - if (phys_proc_id[cpu] == phys_proc_id[i]) - cpu_set(i, cpu_core_map[cpu]); - } - } else - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } -} - -/* * Cleanup possible dangling ends... */ static __cpuinit void smp_cleanup_boot(void) @@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void) * * RED-PEN audit/test this more. I bet there is more state messed up here. */ -static __cpuinit void disable_smp(void) +static __init void disable_smp(void) { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); @@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void) /* * Handle user cpus=... parameter. */ -static __cpuinit void enforce_max_cpus(unsigned max_cpus) +static __init void enforce_max_cpus(unsigned max_cpus) { int i, k; k = 0; @@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus) /* * Various sanity checks. */ -static int __cpuinit smp_sanity_check(unsigned max_cpus) +static int __init smp_sanity_check(unsigned max_cpus) { if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", @@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus) * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. Just do some sanity checking here and enable APIC mode. */ -void __cpuinit smp_prepare_cpus(unsigned int max_cpus) +void __init smp_prepare_cpus(unsigned int max_cpus) { int i; @@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus) int apicid = cpu_present_to_apicid(i); if (physid_isset(apicid, phys_cpu_present_map)) { cpu_set(i, cpu_present_map); - /* possible map would be different if we supported real - CPU hotplug. */ cpu_set(i, cpu_possible_map); } + fixup_cpu_possible_map(i); } if (smp_sanity_check(max_cpus) < 0) { @@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void) int me = smp_processor_id(); cpu_set(me, cpu_online_map); cpu_set(me, cpu_callout_map); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); + per_cpu(cpu_state, me) = CPU_ONLINE; } /* * Entry point to boot a CPU. - * - * This is all __cpuinit, not __devinit for now because we don't support - * CPU hotplug (yet). */ int __cpuinit __cpu_up(unsigned int cpu) { @@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu) return -EINVAL; } + /* + * Already booted CPU? + */ + if (cpu_isset(cpu, cpu_callin_map)) { + Dprintk("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); if (err < 0) { @@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - return 0; + err = 0; + + return err; } /* * Finish the SMP boot. */ -void __cpuinit smp_cpus_done(unsigned int max_cpus) +void __init smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_HOTPLUG_CPU zap_low_mappings(); +#endif smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - detect_siblings(); time_init_gtod(); check_nmi_watchdog(); } + +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +void remove_cpu_from_maps(void) +{ + int cpu = smp_processor_id(); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ +} + +int __cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + disable_APIC_timer(); + + /* + * HACK: + * Allow any queued timer interrupts to get serviced + * This is only a temporary solution until we cleanup + * fixup_irqs as we do for IA64. + */ + local_irq_enable(); + mdelay(1); + + local_irq_disable(); + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + remove_cpu_from_maps(); + fixup_irqs(cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); + return; + } + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +#else /* ... !CONFIG_HOTPLUG_CPU */ + +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ebaa1e37d65..6c0f402e3a8 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt) */ asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("sldt %0" : "=m" (ctxt->ldt)); asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ @@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); + asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); } void save_processor_state(void) @@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); /* + * now restore the descriptor tables to their proper values + * ltr is done i fix_processor_context(). + */ + asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); + asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + + /* * segment registers */ asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds)); @@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); - /* - * now restore the descriptor tables to their proper values - * ltr is done i fix_processor_context(). - */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); - asm volatile ("lldt %0" :: "m" (ctxt->ldt)); - fix_processor_context(); do_fpu_end(); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index fb8c809b4cd..66bf6ddeb0c 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -64,6 +64,7 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ +static int hpet_use_timer; unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ unsigned long long monotonic_base; @@ -105,7 +106,9 @@ static inline unsigned int do_gettimeoffset_tsc(void) static inline unsigned int do_gettimeoffset_hpet(void) { - return ((hpet_readl(HPET_COUNTER) - vxtime.last) * vxtime.quot) >> 32; + /* cap counter read to one tick to avoid inconsistencies */ + unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; + return (min(counter,hpet_tick) * vxtime.quot) >> 32; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -301,7 +304,7 @@ unsigned long long monotonic_clock(void) last_offset = vxtime.last; base = monotonic_base; - this_offset = hpet_readl(HPET_T0_CMP) - hpet_tick; + this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); @@ -377,7 +380,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) write_seqlock(&xtime_lock); - if (vxtime.hpet_address) { + if (vxtime.hpet_address) + offset = hpet_readl(HPET_COUNTER); + + if (hpet_use_timer) { + /* if we're using the hpet timer functionality, + * we can more accurately know the counter value + * when the timer interrupt occured. + */ offset = hpet_readl(HPET_T0_CMP) - hpet_tick; delay = hpet_readl(HPET_COUNTER) - offset; } else { @@ -803,17 +813,18 @@ static int hpet_timer_stop_set_go(unsigned long tick) * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, * and period also hpet_tick. */ - - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ - + hpet_writel(hpet_tick, HPET_T0_CMP); + hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + cfg |= HPET_CFG_LEGACY; + } /* * Go! */ - cfg |= HPET_CFG_ENABLE | HPET_CFG_LEGACY; + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); return 0; @@ -834,8 +845,7 @@ static int hpet_init(void) id = hpet_readl(HPET_ID); - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER) || - !(id & HPET_ID_LEGSUP)) + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) return -1; hpet_period = hpet_readl(HPET_PERIOD); @@ -845,6 +855,8 @@ static int hpet_init(void) hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / hpet_period; + hpet_use_timer = (id & HPET_ID_LEGSUP); + return hpet_timer_stop_set_go(hpet_tick); } @@ -901,9 +913,11 @@ void __init time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) { + if (!hpet_init()) vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + + if (hpet_use_timer) { cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER @@ -968,7 +982,7 @@ void __init time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; if (vxtime.hpet_address && notsc) { - timetype = "HPET"; + timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; vxtime.mode = VXTIME_HPET; do_gettimeoffset = do_gettimeoffset_hpet; @@ -983,7 +997,7 @@ void __init time_init_gtod(void) printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); #endif } else { - timetype = vxtime.hpet_address ? "HPET/TSC" : "PIT/TSC"; + timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 3dfec8fdabc..10273663000 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -274,7 +274,7 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = (regs->cs & 3) == 0; + int in_kernel = !user_mode(regs); unsigned long rsp; const int cpu = safe_smp_processor_id(); struct task_struct *cur = cpu_pda[cpu].pcurrent; @@ -318,7 +318,7 @@ void handle_BUG(struct pt_regs *regs) struct bug_frame f; char tmp; - if (regs->cs & 3) + if (user_mode(regs)) return; if (__copy_from_user(&f, (struct bug_frame *) regs->rip, sizeof(struct bug_frame))) @@ -437,7 +437,7 @@ static void do_trap(int trapnr, int signr, char *str, } #endif - if ((regs->cs & 3) != 0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, signr)) @@ -522,7 +522,7 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) } #endif - if ((regs->cs & 3)!=0) { + if (user_mode(regs)) { struct task_struct *tsk = current; if (exception_trace && unhandled_signal(tsk, SIGSEGV)) @@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) asmlinkage void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); /* Only the BSP gets external NMIs from the system. */ - if (!smp_processor_id()) + if (!cpu) reason = get_nmi_reason(); + if (!cpu_online(cpu)) + return; + if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) @@ -638,7 +644,7 @@ asmlinkage struct pt_regs *sync_regs(struct pt_regs *eregs) if (eregs == (struct pt_regs *)eregs->rsp) ; /* Exception from user space */ - else if (eregs->cs & 3) + else if (user_mode(eregs)) regs = ((struct pt_regs *)current->thread.rsp0) - 1; /* Exception from kernel and interrupts are enabled. Move to kernel process stack. */ @@ -669,7 +675,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) } #endif - asm("movq %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -697,7 +703,7 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) * allowing programs to debug themselves without the ptrace() * interface. */ - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; /* * Was the TF flag set by a debugger? If so, clear it now, @@ -715,13 +721,13 @@ asmlinkage void do_debug(struct pt_regs * regs, unsigned long error_code) info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - if ((regs->cs & 3) == 0) + if (!user_mode(regs)) goto clear_dr7; info.si_addr = (void __user *)regs->rip; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: - asm volatile("movq %0,%%db7"::"r"(0UL)); + set_debugreg(0UL, 7); return; clear_TF_reenable: @@ -756,7 +762,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) unsigned short cwd, swd; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel x87 math error")) return; @@ -822,7 +828,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) unsigned short mxcsr; conditional_sti(regs); - if ((regs->cs & 3) == 0 && + if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error")) return; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 59ebd5beda8..73389f51c4e 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -2,7 +2,10 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; */ +#define LOAD_OFFSET __START_KERNEL_map + #include <asm-generic/vmlinux.lds.h> +#include <asm/page.h> #include <linux/config.h> OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") @@ -11,28 +14,30 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; SECTIONS { - . = 0xffffffff80100000; + . = __START_KERNEL; phys_startup_64 = startup_64 - LOAD_OFFSET; _text = .; /* Text and read-only data */ - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { *(.text) SCHED_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 - .text.lock : { *(.text.lock) } /* out-of-line lock text */ + /* out-of-line lock text */ + .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } _etext = .; /* End of text section */ . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : { *(__ex_table) } + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; RODATA - .data : { /* Data */ + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS } @@ -40,62 +45,95 @@ SECTIONS _edata = .; /* End of data section */ __bss_start = .; /* BSS */ - .bss : { + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { *(.bss.page_aligned) *(.bss) } __bss_end = .; + . = ALIGN(PAGE_SIZE); . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) -#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16) -#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1)) -#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES) + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; - .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) } - __vsyscall_0 = LOADADDR(.vsyscall_0); . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) } - xtime_lock = LOADADDR(.xtime_lock); - .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) } - vxtime = LOADADDR(.vxtime); - .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) } - wall_jiffies = LOADADDR(.wall_jiffies); - .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) } - sys_tz = LOADADDR(.sys_tz); - .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) } - sysctl_vsyscall = LOADADDR(.sysctl_vsyscall); - .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) } - xtime = LOADADDR(.xtime); + .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } + xtime_lock = VVIRT(.xtime_lock); + + .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } + vxtime = VVIRT(.vxtime); + + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } + wall_jiffies = VVIRT(.wall_jiffies); + + .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } + sys_tz = VVIRT(.sys_tz); + + .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } + sysctl_vsyscall = VVIRT(.sysctl_vsyscall); + + .xtime : AT(VLOAD(.xtime)) { *(.xtime) } + xtime = VVIRT(.xtime); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) } - jiffies = LOADADDR(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) } - . = LOADADDR(.vsyscall_0) + 4096; + .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } + jiffies = VVIRT(.jiffies); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT . = ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } . = ALIGN(4096); - .data.page_aligned : { *(.data.page_aligned) } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + } . = ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : { + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; *(.init.text) _einittext = .; } __initdata_begin = .; - .init.data : { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } __initdata_end = .; . = ALIGN(16); __setup_start = .; - .init.setup : { *(.init.setup) } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : { + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -106,32 +144,38 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + *(.con_initcall.init) + } __con_initcall_end = .; SECURITY_INIT . = ALIGN(8); __alt_instructions = .; - .altinstructions : { *(.altinstructions) } + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + *(.altinstructions) + } __alt_instructions_end = .; - .altinstr_replacement : { *(.altinstr_replacement) } + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __init_end = .; . = ALIGN(4096); __nosave_begin = .; - .data_nosave : { *(.data.nosave) } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } . = ALIGN(4096); __nosave_end = .; diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index aed61a668a1..33a873a3c22 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -12,6 +12,7 @@ #include <linux/sched.h> #include <linux/delay.h> #include <asm/delay.h> +#include <asm/msr.h> #ifdef CONFIG_SMP #include <asm/smp.h> @@ -19,6 +20,12 @@ int x86_udelay_tsc = 0; /* Delay via TSC */ +int read_current_timer(unsigned long *timer_value) +{ + rdtscll(*timer_value); + return 0; +} + void __delay(unsigned long loops) { unsigned bclock, now; diff --git a/arch/x86_64/mm/Makefile b/arch/x86_64/mm/Makefile index 66c354ad80c..1d232a87f11 100644 --- a/arch/x86_64/mm/Makefile +++ b/arch/x86_64/mm/Makefile @@ -4,7 +4,7 @@ obj-y := init.o fault.o ioremap.o extable.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_DISCONTIGMEM) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o obj-$(CONFIG_ACPI_NUMA) += srat.o diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 57d3ab15a5c..2f187986f94 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -74,7 +74,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, instr = (unsigned char *)convert_rip_to_linear(current, regs); max_instr = instr + 15; - if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE) + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; while (scan_more && instr < max_instr) { @@ -106,7 +106,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, /* Could check the LDT for lm, but for now it's good enough to assume that long mode only uses well known segments or kernel. */ - scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); break; case 0x60: diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index dbe53b4c7e6..72e4b364ed7 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -318,7 +318,7 @@ void zap_low_mappings(void) flush_tlb_all(); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NUMA void __init paging_init(void) { { @@ -427,13 +427,16 @@ void __init mem_init(void) reservedpages = 0; /* this will put all low memory onto the freelists */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA totalram_pages += numa_free_all_bootmem(); tmp = 0; /* should count reserved pages here for all nodes */ #else + +#ifdef CONFIG_FLATMEM max_mapnr = end_pfn; if (!mem_map) BUG(); +#endif totalram_pages += free_all_bootmem(); @@ -515,7 +518,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_DISCONTIGMEM +#ifdef CONFIG_NUMA int nid = phys_to_nid(phys); reserve_bootmem_node(NODE_DATA(nid), phys, len); #else diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 58aac23760e..6972df480d2 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -178,7 +178,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) return (__force void __iomem *)phys_to_virt(phys_addr); -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * Don't allow anybody to remap normal RAM that we're using.. */ diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index fd9f25d7a6c..ac61c186eb0 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -66,6 +66,13 @@ int __init compute_hash_shift(struct node *nodes, int numnodes) return -1; } +#ifdef CONFIG_SPARSEMEM +int early_pfn_to_nid(unsigned long pfn) +{ + return phys_to_nid(pfn << PAGE_SHIFT); +} +#endif + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { @@ -80,6 +87,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; + memory_present(nodeid, start_pfn, end_pfn); nodedata_phys = find_e820_area(start, end, pgdat_size); if (nodedata_phys == -1L) panic("Cannot find memory pgdat in node %d\n", nodeid); @@ -243,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } -__init void numa_add_cpu(int cpu) +__cpuinit void numa_add_cpu(int cpu) { /* BP is initialized elsewhere */ if (cpu) diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 62349c78db5..7e7d0c2a002 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -53,25 +53,11 @@ fill_mp_bus_to_cpumask(void) for (j = SECONDARY_LDT_BUS_NUMBER(ldtbus); j <= SUBORDINATE_LDT_BUS_NUMBER(ldtbus); j++) - pci_bus_to_cpumask[j] = - node_to_cpumask(NODE_ID(nid)); + pci_bus_to_node[j] = NODE_ID(nid); } } } - /* quick sanity check */ - printed = 0; - for (i = 0; i < 256; i++) { - if (cpus_empty(pci_bus_to_cpumask[i])) { - pci_bus_to_cpumask[i] = CPU_MASK_ALL; - if (printed) - continue; - printk(KERN_ERR - "k8-bus.c: some busses have empty cpu mask\n"); - printed = 1; - } - } - return 0; } |