diff options
Diffstat (limited to 'arch/x86')
278 files changed, 10124 insertions, 6835 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..862adb9bf0d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,8 @@ config X86_64 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE @@ -29,11 +31,14 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT config ARCH_DEFCONFIG string @@ -87,6 +92,10 @@ config GENERIC_IOMAP config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y @@ -238,25 +247,39 @@ config X86_HAS_BOOT_CPU_ID def_bool y depends on X86_VOYAGER +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + help + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. + + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. + +config NUMA_MIGRATE_IRQ_DESC + bool "Move irq desc when changing irq smp_affinity" + depends on SPARSE_IRQ && NUMA + default n + help + This enables moving irq_desc to cpu/node that irq will use handled. + + If you don't know what to do here, say N. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -endif - -if !ACPI -config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC -endif choice prompt "Subarchitecture Type" @@ -367,10 +390,10 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" - depends on X86_32 + depends on X86 help Calculate simpler /proc/<PID>/wchan values. If this option is disabled then wchan values will recurse back to the @@ -465,10 +488,6 @@ config X86_CYCLONE_TIMER def_bool y depends on X86_GENERICARCH -config ES7000_CLUSTERED_APIC - def_bool y - depends on SMP && X86_ES7000 && MPENTIUMIII - source "arch/x86/Kconfig.cpu" config HPET_TIMER @@ -482,7 +501,7 @@ config HPET_TIMER The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - <http://www.intel.com/hardwaredesign/hpetspec.htm>. + <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>. You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. @@ -567,9 +586,19 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. +config AMD_IOMMU_STATS + bool "Export AMD IOMMU statistics to debugfs" + depends on AMD_IOMMU + select DEBUG_FS + help + This option enables code in the AMD IOMMU driver to collect various + statistics about whats happening in the driver and exports that + information to userspace via debugfs. + If unsure, say N. + # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. the current generation @@ -580,21 +609,25 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) +config IOMMU_API + def_bool (AMD_IOMMU || DMAR) + config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP && BROKEN + depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL + select CPUMASK_OFFSTACK default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. config NR_CPUS - int "Maximum number of CPUs (2-512)" if !MAXSMP - range 2 512 - depends on SMP + int "Maximum number of CPUs" if SMP && !MAXSMP + range 2 512 if SMP && !MAXSMP + default "1" if !SMP default "4096" if MAXSMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" + default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "8" if SMP help This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 512 and the @@ -660,6 +693,30 @@ config X86_VISWS_APIC def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER @@ -956,24 +1013,37 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core i7 + (or later), AMD Opteron, or EM64T NUMA. + + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform. + + Otherwise, you should say N. comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) @@ -1493,6 +1563,10 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) +config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA @@ -1632,13 +1706,6 @@ config APM_ALLOW_INTS many of the newer IBM Thinkpads. If you experience hangs when you suspend, try setting this to Y. Otherwise, say N. -config APM_REAL_MODE_POWER_OFF - bool "Use real mode APM BIOS call to power off" - help - Use real mode APM BIOS calls to switch off the computer. This is - a work-around for a number of buggy BIOSes. Switch this option on if - your computer crashes instead of powering off properly. - endif # APM source "arch/x86/kernel/cpu/cpufreq/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe37..8078955845a 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -408,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EMBEDDED @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32 config X86_DS def_bool X86_PTRACE_BTS depends on X86_DEBUGCTLMSR + select HAVE_HW_BRANCH_TRACER config X86_PTRACE_BTS bool "Branch Trace Store" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..10d6cc3fd05 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -114,18 +114,6 @@ config DEBUG_RODATA data. This is recommended so that we can catch kernel bugs sooner. If in doubt, say "Y". -config DIRECT_GBPAGES - bool "Enable gbpages-mapped kernel pagetables" - depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 - help - Enable gigabyte pages support (if the CPU supports it). This can - improve the kernel's performance a tiny bit by reducing TLB - pressure. - - This is experimental code. - - If in doubt, say "N". - config DEBUG_RODATA_TEST bool "Testcase for the DEBUG_RODATA feature" depends on DEBUG_RODATA @@ -186,14 +174,10 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE_HOOKS - bool - config MMIOTRACE bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && PCI select TRACING - select MMIOTRACE_HOOKS help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap @@ -307,10 +291,10 @@ config OPTIMIZE_INLINING developers have marked 'inline'. Doing so takes away freedom from gcc to do what it thinks is best, which is desirable for the gcc 3.x series of compilers. The gcc 4.x series have a rewritten inlining algorithm and - disabling this option will generate a smaller kernel there. Hopefully - this algorithm is so good that allowing gcc4 to make the decision can - become the default in the future, until then this option is there to - test gcc for this. + enabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc 4.x and above to make the + decision will become the default in the future. Until then this option + is there to test gcc for this. If unsure, say N. diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index b939cb476de..5d4742ed4aa 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -34,7 +34,7 @@ static struct mode_info cga_modes[] = { { VIDEO_80x25, 80, 25, 0 }, }; -__videocard video_vga; +static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) @@ -259,7 +259,7 @@ static int vga_probe(void) return mode_count[adapter]; } -__videocard video_vga = { +static __videocard video_vga = { .card_name = "VGA", .probe = vga_probe, .set_mode = vga_set_mode, diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 83598b23093..3bef2c1febe 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -226,7 +226,7 @@ static unsigned int mode_menu(void) #ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ -struct saved_screen { +static struct saved_screen { int x, y; int curx, cury; u16 *data; diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 13b8c86ae98..b30a08ed8eb 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -77,7 +77,7 @@ CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y CONFIG_AUDIT_TREE=y # CONFIG_IKCONFIG is not set -CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y @@ -298,7 +298,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index f0a03d7a7d6..0e7dbc0a3e4 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -77,7 +77,7 @@ CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y CONFIG_AUDIT_TREE=y # CONFIG_IKCONFIG is not set -CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y # CONFIG_CGROUP_DEBUG is not set CONFIG_CGROUP_NS=y @@ -298,7 +298,7 @@ CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y +# CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x200000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c index 070afc5b6c9..b9d00261703 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel.c @@ -6,13 +6,22 @@ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual * Volume 2A: Instruction Set Reference, A-M * - * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com> - * Copyright (c) 2008 Kent Liu <kent.liu@intel.com> + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang <austin_zhang@linux.intel.com> + * Kent Liu <kent.liu@intel.com> * * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * */ #include <linux/init.h> @@ -75,99 +84,92 @@ static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len * If your algorithm starts with ~0, then XOR with ~0 before you set * the seed. */ -static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, +static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, unsigned int keylen) { - u32 *mctx = crypto_ahash_ctx(hash); + u32 *mctx = crypto_shash_ctx(hash); if (keylen != sizeof(u32)) { - crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } *mctx = le32_to_cpup((__le32 *)key); return 0; } -static int crc32c_intel_init(struct ahash_request *req) +static int crc32c_intel_init(struct shash_desc *desc) { - u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - u32 *crcp = ahash_request_ctx(req); + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); *crcp = *mctx; return 0; } -static int crc32c_intel_update(struct ahash_request *req) +static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct crypto_hash_walk walk; - u32 *crcp = ahash_request_ctx(req); - u32 crc = *crcp; - int nbytes; - - for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; - nbytes = crypto_hash_walk_done(&walk, 0)) - crc = crc32c_intel_le_hw(crc, walk.data, nbytes); + u32 *crcp = shash_desc_ctx(desc); - *crcp = crc; + *crcp = crc32c_intel_le_hw(*crcp, data, len); return 0; } -static int crc32c_intel_final(struct ahash_request *req) +static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) { - u32 *crcp = ahash_request_ctx(req); - - *(__le32 *)req->result = ~cpu_to_le32p(crcp); + *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); return 0; } -static int crc32c_intel_digest(struct ahash_request *req) +static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct crypto_hash_walk walk; - u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); - u32 crc = *mctx; - int nbytes; + return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); +} - for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; - nbytes = crypto_hash_walk_done(&walk, 0)) - crc = crc32c_intel_le_hw(crc, walk.data, nbytes); +static int crc32c_intel_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); - *(__le32 *)req->result = ~cpu_to_le32(crc); + *(__le32 *)out = ~cpu_to_le32p(crcp); return 0; } +static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + static int crc32c_intel_cra_init(struct crypto_tfm *tfm) { u32 *key = crypto_tfm_ctx(tfm); *key = ~0; - tfm->crt_ahash.reqsize = sizeof(u32); - return 0; } -static struct crypto_alg alg = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-intel", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_AHASH, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_alignmask = 3, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(alg.cra_list), - .cra_init = crc32c_intel_cra_init, - .cra_type = &crypto_ahash_type, - .cra_u = { - .ahash = { - .digestsize = CHKSUM_DIGEST_SIZE, - .setkey = crc32c_intel_setkey, - .init = crc32c_intel_init, - .update = crc32c_intel_update, - .final = crc32c_intel_final, - .digest = crc32c_intel_digest, - } +static struct shash_alg alg = { + .setkey = crc32c_intel_setkey, + .init = crc32c_intel_init, + .update = crc32c_intel_update, + .final = crc32c_intel_final, + .finup = crc32c_intel_finup, + .digest = crc32c_intel_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-intel", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_intel_cra_init, } }; @@ -175,14 +177,14 @@ static struct crypto_alg alg = { static int __init crc32c_intel_mod_init(void) { if (cpu_has_xmm4_2) - return crypto_register_alg(&alg); + return crypto_register_shash(&alg); else return -ENODEV; } static void __exit crc32c_intel_mod_fini(void) { - crypto_unregister_alg(&alg); + crypto_unregister_shash(&alg); } module_init(crc32c_intel_mod_init); @@ -194,4 +196,3 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("crc32c"); MODULE_ALIAS("crc32c-intel"); - diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 127ec3f0721..2a4d073d2cf 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -327,7 +327,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->cached_hole_size = 0; current->mm->mmap = NULL; - compute_creds(bprm); + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 4bc02b23674..9dabd00e980 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,13 +24,14 @@ #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> -#include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/ia32_unistd.h> #include <asm/user32.h> #include <asm/sigcontext32.h> #include <asm/proto.h> #include <asm/vdso.h> +#include <asm/sigframe.h> +#include <asm/sys_ia32.h> #define DEBUG_SIG 0 @@ -41,7 +42,6 @@ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ X86_EFLAGS_CF) -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) @@ -173,47 +173,28 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, /* * Do a signal return; undo the signal stack. */ +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} -struct sigframe -{ - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */ - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe -{ - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - char retcode[8]; - /* fp state follows here */ -}; - -#define COPY(x) { \ - unsigned int reg; \ - err |= __get_user(reg, &sc->x); \ - regs->x = reg; \ +#define COPY_SEG_CPL3(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ } -#define RELOAD_SEG(seg,mask) \ - { unsigned int cur; \ - unsigned short pre; \ - err |= __get_user(pre, &sc->seg); \ - savesegment(seg, cur); \ - pre |= mask; \ - if (pre != cur) loadsegment(seg, pre); } +#define RELOAD_SEG(seg) { \ + unsigned int cur, pre; \ + err |= __get_user(pre, &sc->seg); \ + savesegment(seg, cur); \ + pre |= 3; \ + if (pre != cur) \ + loadsegment(seg, pre); \ +} static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, - unsigned int *peax) + unsigned int *pax) { unsigned int tmpflags, gs, oldgs, err = 0; void __user *buf; @@ -240,18 +221,16 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, if (gs != oldgs) load_gs_index(gs); - RELOAD_SEG(fs, 3); - RELOAD_SEG(ds, 3); - RELOAD_SEG(es, 3); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); /* Don't touch extended registers */ - err |= __get_user(regs->cs, &sc->cs); - regs->cs |= 3; - err |= __get_user(regs->ss, &sc->ss); - regs->ss |= 3; + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); err |= __get_user(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -262,15 +241,13 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); err |= restore_i387_xstate_ia32(buf); - err |= __get_user(tmp, &sc->ax); - *peax = tmp; - + err |= __get_user(*pax, &sc->ax); return err; } asmlinkage long sys32_sigreturn(struct pt_regs *regs) { - struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); + struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; unsigned int ax; @@ -300,12 +277,12 @@ badframe: asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) { - struct rt_sigframe __user *frame; + struct rt_sigframe_ia32 __user *frame; sigset_t set; unsigned int ax; struct pt_regs tregs; - frame = (struct rt_sigframe __user *)(regs->sp - 4); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -359,20 +336,15 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, err |= __put_user(regs->dx, &sc->dx); err |= __put_user(regs->cx, &sc->cx); err |= __put_user(regs->ax, &sc->ax); - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); err |= __put_user(regs->flags, &sc->flags); err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); - tmp = save_i387_xstate_ia32(fpstate); - if (tmp < 0) - err = -EFAULT; - else - err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), - &sc->fpstate); + err |= __put_user(ptr_to_compat(fpstate), &sc->fpstate); /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); @@ -400,7 +372,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, } /* This is the legacy signal stack switching. */ - else if ((regs->ss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER32_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) sp = (unsigned long) ka->sa.sa_restorer; @@ -408,6 +380,8 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, if (used_math()) { sp = sp - sig_xstate_ia32_size; *fpstate = (struct _fpstate_ia32 *) sp; + if (save_i387_xstate_ia32(*fpstate) < 0) + return (void __user *) -1L; } sp -= frame_size; @@ -420,7 +394,7 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, int ia32_setup_frame(int sig, struct k_sigaction *ka, compat_sigset_t *set, struct pt_regs *regs) { - struct sigframe __user *frame; + struct sigframe_ia32 __user *frame; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -430,12 +404,10 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, u16 poplmovl; u32 val; u16 int80; - u16 pad; } __attribute__((packed)) code = { 0xb858, /* popl %eax ; movl $...,%eax */ __NR_ia32_sigreturn, 0x80cd, /* int $0x80 */ - 0, }; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -471,7 +443,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, * These are actually not used anymore, but left because some * gdb versions depend on them as a marker. */ - err |= __copy_to_user(frame->retcode, &code, 8); + err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -501,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, compat_sigset_t *set, struct pt_regs *regs) { - struct rt_sigframe __user *frame; + struct rt_sigframe_ia32 __user *frame; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -511,8 +483,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, u8 movl; u32 val; u16 int80; - u16 pad; - u8 pad2; + u8 pad; } __attribute__((packed)) code = { 0xb8, __NR_ia32_rt_sigreturn, @@ -559,7 +530,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * Not actually used anymore, but left because some gdb * versions need it. */ - err |= __copy_to_user(frame->retcode, &code, 8); + err |= __put_user(*((u64 *)&code), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -572,11 +543,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - /* Make -mregparm=3 work */ - regs->ax = sig; - regs->dx = (unsigned long) &frame->info; - regs->cx = (unsigned long) &frame->uc; - loadsegment(ds, __USER32_DS); loadsegment(es, __USER32_DS); diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index d21991ce606..29cdcd02ead 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -8,6 +8,7 @@ #include <linux/shm.h> #include <linux/ipc.h> #include <linux/compat.h> +#include <asm/sys_ia32.h> asmlinkage long sys32_ipc(u32 call, int first, int second, int third, compat_uptr_t ptr, u32 fifth) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 2e09dcd3c0a..6c0d7f6231a 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -44,8 +44,8 @@ #include <asm/types.h> #include <asm/uaccess.h> #include <asm/atomic.h> -#include <asm/ia32.h> #include <asm/vgtod.h> +#include <asm/sys_ia32.h> #define AA(__x) ((unsigned long)(__x)) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 1a30c0440c6..95c8cd9d22b 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -190,16 +190,23 @@ /* FIXME: move this macro to <linux/pci.h> */ #define PCI_BUS(x) (((x) >> 8) & 0xff) +/* Protection domain flags */ +#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ +#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops + domain for an IOMMU */ + /* * This structure contains generic data for IOMMU protection domains * independent of their use. */ struct protection_domain { - spinlock_t lock; /* mostly used to lock the page table*/ - u16 id; /* the domain id written to the device table */ - int mode; /* paging mode (0-6 levels) */ - u64 *pt_root; /* page table root pointer */ - void *priv; /* private data */ + spinlock_t lock; /* mostly used to lock the page table*/ + u16 id; /* the domain id written to the device table */ + int mode; /* paging mode (0-6 levels) */ + u64 *pt_root; /* page table root pointer */ + unsigned long flags; /* flags to find out type of domain */ + unsigned dev_cnt; /* devices assigned to this domain */ + void *priv; /* private data */ }; /* @@ -251,13 +258,6 @@ struct amd_iommu { /* Pointer to PCI device of this IOMMU */ struct pci_dev *dev; - /* - * Capability pointer. There could be more than one IOMMU per PCI - * device function if there are more than one AMD IOMMU capability - * pointers. - */ - u16 cap_ptr; - /* physical address of MMIO space */ u64 mmio_phys; /* virtual address of MMIO space */ @@ -266,6 +266,13 @@ struct amd_iommu { /* capabilities of that IOMMU read from ACPI */ u32 cap; + /* + * Capability pointer. There could be more than one IOMMU per PCI + * device function if there are more than one AMD IOMMU capability + * pointers. + */ + u16 cap_ptr; + /* pci domain of this IOMMU */ u16 pci_seg; @@ -284,19 +291,19 @@ struct amd_iommu { /* size of command buffer */ u32 cmd_buf_size; - /* event buffer virtual address */ - u8 *evt_buf; /* size of event buffer */ u32 evt_buf_size; + /* event buffer virtual address */ + u8 *evt_buf; /* MSI number for event interrupt */ u16 evt_msi_num; - /* if one, we need to send a completion wait command */ - int need_sync; - /* true if interrupts for this IOMMU are already enabled */ bool int_enabled; + /* if one, we need to send a completion wait command */ + bool need_sync; + /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; }; @@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table; extern unsigned long *amd_iommu_pd_alloc_bitmap; /* will be 1 if device isolation is enabled */ -extern int amd_iommu_isolate; +extern bool amd_iommu_isolate; /* * If true, the addresses will be flushed on unmap time, not when @@ -382,18 +389,6 @@ extern int amd_iommu_isolate; */ extern bool amd_iommu_unmap_flush; -/* takes a PCI device id and prints it out in a readable form */ -static inline void print_devid(u16 devid, int nl) -{ - int bus = devid >> 8; - int dev = devid >> 3 & 0x1f; - int fn = devid & 0x07; - - printk("%02x:%02x.%x", bus, dev, fn); - if (nl) - printk("\n"); -} - /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) @@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn) return (((u16)bus) << 8) | devfn; } +#ifdef CONFIG_AMD_IOMMU_STATS + +struct __iommu_counter { + char *name; + struct dentry *dent; + u64 value; +}; + +#define DECLARE_STATS_COUNTER(nm) \ + static struct __iommu_counter nm = { \ + .name = #nm, \ + } + +#define INC_STATS_COUNTER(name) name.value += 1 +#define ADD_STATS_COUNTER(name, x) name.value += (x) +#define SUB_STATS_COUNTER(name, x) name.value -= (x) + +#else /* CONFIG_AMD_IOMMU_STATS */ + +#define DECLARE_STATS_COUNTER(name) +#define INC_STATS_COUNTER(name) +#define ADD_STATS_COUNTER(name, x) +#define SUB_STATS_COUNTER(name, x) + +static inline void amd_iommu_stats_init(void) { } + +#endif /* CONFIG_AMD_IOMMU_STATS */ + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3b1510b4fc5..ab1d51a8855 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -54,7 +54,6 @@ extern int disable_apic; extern int is_vsmp_box(void); extern void xapic_wait_icr_idle(void); extern u32 safe_xapic_wait_icr_idle(void); -extern u64 xapic_icr_read(void); extern void xapic_icr_write(u32, u32); extern int setup_profiling_timer(unsigned int); @@ -93,7 +92,7 @@ static inline u32 native_apic_msr_read(u32 reg) } #ifndef CONFIG_X86_32 -extern int x2apic, x2apic_preenabled; +extern int x2apic; extern void check_x2apic(void); extern void enable_x2apic(void); extern void enable_IR_x2apic(void); @@ -193,6 +192,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h index 1d9543b9d35..d8dd9f53791 100644 --- a/arch/x86/include/asm/bigsmp/apic.h +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -9,12 +9,12 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { #ifdef CONFIG_SMP - return cpu_online_map; + return &cpu_online_map; #else - return cpumask_of_cpu(0); + return &cpumask_of_cpu(0); #endif } @@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target proc */ #define NO_BALANCE_IRQ (0) -#define WAKE_SECONDARY_VIA_INIT - static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -81,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS) + if (mps_cpu < nr_cpu_ids) return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); return BAD_APICID; @@ -96,7 +94,7 @@ extern u8 cpu_2_logical_apicid[]; /* Mapping from cpu number to logical apicid */ static inline int cpu_to_logical_apicid(int cpu) { - if (cpu >= NR_CPUS) + if (cpu >= nr_cpu_ids) return BAD_APICID; return cpu_physical_id(cpu); } @@ -121,16 +119,34 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) } /* As we are using single CPU as destination, pick only one CPU here */ -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int cpu; int apicid; - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); apicid = cpu_to_logical_apicid(cpu); return apicid; } +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return cpu_to_logical_apicid(cpu); + + return BAD_APICID; +} + static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h index 9404c535b7e..27fcd01b3ae 100644 --- a/arch/x86/include/asm/bigsmp/ipi.h +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -1,25 +1,22 @@ #ifndef __ASM_MACH_IPI_H #define __ASM_MACH_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 36001032271..9fa9dcdf344 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -168,7 +168,15 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) */ static inline void change_bit(int nr, volatile unsigned long *addr) { - asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "xorb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btc %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } } /** diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 3def2065fce..d9cf1cd156d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_32 # define __BUG_C0 "2:\t.long 1b, %c0\n" #else -# define __BUG_C0 "2:\t.quad 1b, %c0\n" +# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" #endif #define BUG() \ diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h index e02ae2d89ac..f110ad417df 100644 --- a/arch/x86/include/asm/byteorder.h +++ b/arch/x86/include/asm/byteorder.h @@ -4,26 +4,33 @@ #include <asm/types.h> #include <linux/compiler.h> -#ifdef __GNUC__ +#define __LITTLE_ENDIAN -#ifdef __i386__ - -static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) +static inline __attribute_const__ __u32 __arch_swab32(__u32 val) { -#ifdef CONFIG_X86_BSWAP - asm("bswap %0" : "=r" (x) : "0" (x)); -#else +#ifdef __i386__ +# ifdef CONFIG_X86_BSWAP + asm("bswap %0" : "=r" (val) : "0" (val)); +# else asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ "rorl $16,%0\n\t" /* swap words */ "xchgb %b0,%h0" /* swap higher bytes */ - : "=q" (x) - : "0" (x)); + : "=q" (val) + : "0" (val)); +# endif + +#else /* __i386__ */ + asm("bswapl %0" + : "=r" (val) + : "0" (val)); #endif - return x; + return val; } +#define __arch_swab32 __arch_swab32 -static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) +static inline __attribute_const__ __u64 __arch_swab64(__u64 val) { +#ifdef __i386__ union { struct { __u32 a; @@ -32,50 +39,27 @@ static inline __attribute_const__ __u64 ___arch__swab64(__u64 val) __u64 u; } v; v.u = val; -#ifdef CONFIG_X86_BSWAP +# ifdef CONFIG_X86_BSWAP asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -#else - v.s.a = ___arch__swab32(v.s.a); - v.s.b = ___arch__swab32(v.s.b); +# else + v.s.a = __arch_swab32(v.s.a); + v.s.b = __arch_swab32(v.s.b); asm("xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -#endif +# endif return v.u; -} - #else /* __i386__ */ - -static inline __attribute_const__ __u64 ___arch__swab64(__u64 x) -{ asm("bswapq %0" - : "=r" (x) - : "0" (x)); - return x; -} - -static inline __attribute_const__ __u32 ___arch__swab32(__u32 x) -{ - asm("bswapl %0" - : "=r" (x) - : "0" (x)); - return x; -} - + : "=r" (val) + : "0" (val)); + return val; #endif +} +#define __arch_swab64 __arch_swab64 -/* Do not define swab16. Gcc is smart enough to recognize "C" version and - convert it into rotation or exhange. */ - -#define __arch__swab64(x) ___arch__swab64(x) -#define __arch__swab32(x) ___arch__swab32(x) - -#define __BYTEORDER_HAS_U64__ - -#endif /* __GNUC__ */ - -#include <linux/byteorder/little_endian.h> +#include <linux/byteorder.h> #endif /* _ASM_X86_BYTEORDER_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index cfdf8c2c5c3..ea408dcba51 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -80,7 +80,6 @@ #define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ #define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */ @@ -92,6 +91,8 @@ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ @@ -117,6 +118,7 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ #define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ @@ -237,6 +239,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e6b82b17b07..dc27705f544 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr) _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); } -#define SYS_VECTOR_FREE 0 -#define SYS_VECTOR_ALLOCED 1 - extern int first_system_vector; -extern char system_vectors[]; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; static inline void alloc_system_vector(int vector) { - if (system_vectors[vector] == SYS_VECTOR_FREE) { - system_vectors[vector] = SYS_VECTOR_ALLOCED; + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); if (first_system_vector > vector) first_system_vector = vector; } else diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 7f225a4b2a2..4035357f5b9 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -65,21 +65,17 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) return dma_ops; else return dev->archdata.dma_ops; -#endif /* _ASM_X86_DMA_MAPPING_H */ +#endif } /* Make sure we keep the same behaviour */ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { -#ifdef CONFIG_X86_32 - return 0; -#else struct dma_mapping_ops *ops = get_dma_ops(dev); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); return (dma_addr == bad_dma_address); -#endif } #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a95008457ea..a8f672ba100 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -6,14 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - DS and BTS hardware configuration + * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -26,11 +25,51 @@ #include <linux/types.h> #include <linux/init.h> +#include <linux/err.h> #ifdef CONFIG_X86_DS struct task_struct; +struct ds_context; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); + + +/* + * A list of features plus corresponding macros to talk about them in + * the ds_request function's flags parameter. + * + * We use the enum to index an array of corresponding control bits; + * we use the macro to index a flags bit-vector. + */ +enum ds_feature { + dsf_bts = 0, + dsf_bts_kernel, +#define BTS_KERNEL (1 << dsf_bts_kernel) + /* trace kernel-mode branches */ + + dsf_bts_user, +#define BTS_USER (1 << dsf_bts_user) + /* trace user-mode branches */ + + dsf_bts_overflow, + dsf_bts_max, + dsf_pebs = dsf_bts_max, + + dsf_pebs_max, + dsf_ctl_max = dsf_pebs_max, + dsf_bts_timestamps = dsf_ctl_max, +#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) + /* add timestamps into BTS trace */ + +#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) +}; + /* * Request BTS or PEBS @@ -38,163 +77,169 @@ struct task_struct; * Due to alignement constraints, the actual buffer may be slightly * smaller than the requested or provided buffer. * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent from the overflow callback + * to allow users to use their own overflow interrupt handling mechanism. * * task: the task to request recording for; * NULL for per-cpu recording on the current cpu * base: the base pointer for the (non-pageable) buffer; - * NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; * NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + * -1 if no interrupt threshold is requested. + * flags: a bit-mask of the above flags */ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); /* * Release BTS or PEBS resources + * Suspend and resume BTS or PEBS tracing * - * Frees buffers allocated on ds_request. - * - * Returns 0 on success; -Eerrno otherwise - * - * task: the task to release resources for; - * NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern void ds_release_bts(struct bts_tracer *tracer); +extern void ds_suspend_bts(struct bts_tracer *tracer); +extern void ds_resume_bts(struct bts_tracer *tracer); +extern void ds_release_pebs(struct pebs_tracer *tracer); +extern void ds_suspend_pebs(struct pebs_tracer *tracer); +extern void ds_resume_pebs(struct pebs_tracer *tracer); -/* - * Return the (array) index of the write pointer. - * (assuming an array of BTS/PEBS records) - * - * Returns -Eerrno on error - * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result - */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); /* - * Return the (array) index one record beyond the end of the array. - * (assuming an array of BTS/PEBS records) + * The raw DS buffer state as it is used for BTS and PEBS recording. * - * Returns -Eerrno on error - * - * task: the task to access; - * NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * This is the low-level, arch-dependent interface for working + * directly on the raw trace data. */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +struct ds_trace { + /* the number of bts/pebs records */ + size_t n; + /* the size of a bts/pebs record in bytes */ + size_t size; + /* pointers into the raw buffer: + - to the first entry */ + void *begin; + /* - one beyond the last entry */ + void *end; + /* - one beyond the newest entry */ + void *top; + /* - the interrupt threshold */ + void *ith; + /* flags given on ds_request() */ + unsigned int flags; +}; /* - * Provide a pointer to the BTS/PEBS record at parameter index. - * (assuming an array of BTS/PEBS records) - * - * The pointer points directly into the buffer. The user is - * responsible for copying the record. - * - * Returns the size of a single record on success; -Eerrno on error - * - * task: the task to access; - * NULL to access the current cpu - * index: the index of the requested record - * record (out): pointer to the requested record + * An arch-independent view on branch trace data. */ -extern int ds_access_bts(struct task_struct *task, - size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, - size_t index, const void **record); +enum bts_qualifier { + bts_invalid, +#define BTS_INVALID bts_invalid + + bts_branch, +#define BTS_BRANCH bts_branch + + bts_task_arrives, +#define BTS_TASK_ARRIVES bts_task_arrives + + bts_task_departs, +#define BTS_TASK_DEPARTS bts_task_departs + + bts_qual_bit_size = 4, + bts_qual_max = (1 << bts_qual_bit_size), +}; + +struct bts_struct { + __u64 qualifier; + union { + /* BTS_BRANCH */ + struct { + __u64 from; + __u64 to; + } lbr; + /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ + struct { + __u64 jiffies; + pid_t pid; + } timestamp; + } variant; +}; -/* - * Write one or more BTS/PEBS records at the write pointer index and - * advance the write pointer. - * - * If size is not a multiple of the record size, trailing bytes are - * zeroed out. - * - * May result in one or more overflow notifications. - * - * If called during overflow handling, that is, with index >= - * interrupt threshold, the write will wrap around. - * - * An overflow notification is given if and when the interrupt - * threshold is reached during or after the write. - * - * Returns the number of bytes written or -Eerrno. - * - * task: the task to access; - * NULL to access the current cpu - * buffer: the buffer to write - * size: the size of the buffer - */ -extern int ds_write_bts(struct task_struct *task, - const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, - const void *buffer, size_t size); /* - * Same as ds_write_bts/pebs, but omit ownership checks. + * The BTS state. * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. + * This gives access to the raw DS state and adds functions to provide + * an arch-independent view of the BTS data. */ -extern int ds_unchecked_write_bts(struct task_struct *task, - const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, - const void *buffer, size_t size); +struct bts_trace { + struct ds_trace ds; + + int (*read)(struct bts_tracer *tracer, const void *at, + struct bts_struct *out); + int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); +}; + /* - * Reset the write pointer of the BTS/PEBS buffer. + * The PEBS state. * - * Returns 0 on success; -Eerrno on error - * - * task: the task to access; - * NULL to access the current cpu + * This gives access to the raw DS state and the PEBS-specific counter + * reset value. */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +struct pebs_trace { + struct ds_trace ds; + + /* the PEBS reset value */ + unsigned long long reset_value; +}; + /* - * Clear the BTS/PEBS buffer and reset the write pointer. - * The entire buffer will be zeroed out. + * Read the BTS or PEBS trace. * - * Returns 0 on success; -Eerrno on error + * Returns a view on the trace collected for the parameter tracer. + * + * The view remains valid as long as the traced task is not running or + * the tracer is suspended. + * Writes into the trace buffer are not reflected. * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); +extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); + /* - * Provide the PEBS counter reset value. + * Reset the write pointer of the BTS/PEBS buffer. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu - * value (out): the counter reset value + * tracer: the tracer handle returned from ds_request_~() */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer); /* * Set the PEBS counter reset value. * * Returns 0 on success; -Eerrno on error * - * task: the task to access; - * NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs() * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); /* * Initialization @@ -202,39 +247,26 @@ extern int ds_set_pebs_reset(struct task_struct *task, u64 value); struct cpuinfo_x86; extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - - /* - * The DS context - part of struct thread_struct. + * Context switch work */ -struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ - unsigned char *ds; - /* the owner of the BTS and PEBS configuration, respectively */ - struct task_struct *owner[2]; - /* buffer overflow notification function for BTS and PEBS */ - ds_ovfl_callback_t callback[2]; - /* the original buffer address */ - void *buffer[2]; - /* the number of allocated pages for on-request allocated buffers */ - unsigned int pages[2]; - /* use count */ - unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ - struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ - struct task_struct *task; -}; +extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); -/* called by exit_thread() to free leftover contexts */ -extern void ds_free(struct ds_context *context); +/* + * Task clone/init and cleanup work + */ +extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); +extern void ds_exit_thread(struct task_struct *tsk); #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} +static inline void ds_switch_to(struct task_struct *prev, + struct task_struct *next) {} +static inline void ds_copy_thread(struct task_struct *tsk, + struct task_struct *father) {} +static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 804b6e6be92..3afc5e87cfd 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -6,56 +6,91 @@ #endif /* - Macros for dwarf2 CFI unwind table entries. - See "as.info" for details on these pseudo ops. Unfortunately - they are only supported in very new binutils, so define them - away for older version. + * Macros for dwarf2 CFI unwind table entries. + * See "as.info" for details on these pseudo ops. Unfortunately + * they are only supported in very new binutils, so define them + * away for older version. */ #ifdef CONFIG_AS_CFI -#define CFI_STARTPROC .cfi_startproc -#define CFI_ENDPROC .cfi_endproc -#define CFI_DEF_CFA .cfi_def_cfa -#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register -#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset -#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset -#define CFI_OFFSET .cfi_offset -#define CFI_REL_OFFSET .cfi_rel_offset -#define CFI_REGISTER .cfi_register -#define CFI_RESTORE .cfi_restore -#define CFI_REMEMBER_STATE .cfi_remember_state -#define CFI_RESTORE_STATE .cfi_restore_state -#define CFI_UNDEFINED .cfi_undefined +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined #ifdef CONFIG_AS_CFI_SIGNAL_FRAME -#define CFI_SIGNAL_FRAME .cfi_signal_frame +#define CFI_SIGNAL_FRAME .cfi_signal_frame #else #define CFI_SIGNAL_FRAME #endif #else -/* Due to the structure of pre-exisiting code, don't use assembler line - comment character # to ignore the arguments. Instead, use a dummy macro. */ +/* + * Due to the structure of pre-exisiting code, don't use assembler line + * comment character # to ignore the arguments. Instead, use a dummy macro. + */ .macro cfi_ignore a=0, b=0, c=0, d=0 .endm -#define CFI_STARTPROC cfi_ignore -#define CFI_ENDPROC cfi_ignore -#define CFI_DEF_CFA cfi_ignore +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore #define CFI_DEF_CFA_REGISTER cfi_ignore #define CFI_DEF_CFA_OFFSET cfi_ignore #define CFI_ADJUST_CFA_OFFSET cfi_ignore -#define CFI_OFFSET cfi_ignore -#define CFI_REL_OFFSET cfi_ignore -#define CFI_REGISTER cfi_ignore -#define CFI_RESTORE cfi_ignore -#define CFI_REMEMBER_STATE cfi_ignore -#define CFI_RESTORE_STATE cfi_ignore -#define CFI_UNDEFINED cfi_ignore -#define CFI_SIGNAL_FRAME cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore #endif +/* + * An attempt to make CFI annotations more or less + * correct and shorter. It is implied that you know + * what you're doing if you use them. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_X86_64 + .macro pushq_cfi reg + pushq \reg + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popq_cfi reg + popq \reg + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro movq_cfi reg offset=0 + movq %\reg, \offset(%rsp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movq_cfi_restore offset reg + movq \offset(%rsp), %\reg + CFI_RESTORE \reg + .endm +#else /*!CONFIG_X86_64*/ + + /* 32bit defenitions are missed yet */ + +#endif /*!CONFIG_X86_64*/ +#endif /*__ASSEMBLY__*/ + #endif /* _ASM_X86_DWARF2_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index a2e545c91c3..ca5ffb2856b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); #endif /* CONFIG_X86_32 */ +extern int add_efi_memmap; extern void efi_reserve_early(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 40ca1bea791..f51a3ddde01 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -325,7 +325,7 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int executable_stack); + int uses_interp); extern int syscall32_setup_pages(struct linux_binprm *, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 94826cf8745..cc70c1c78ca 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -8,7 +8,9 @@ enum reboot_type { BOOT_BIOS = 'b', #endif BOOT_ACPI = 'a', - BOOT_EFI = 'e' + BOOT_EFI = 'e', + BOOT_CF9 = 'p', + BOOT_CF9_COND = 'q', }; extern enum reboot_type reboot_type; diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 380f0b4f17e..bc53d5ef138 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,31 +9,27 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus_cluster(void) { -#if defined CONFIG_ES7000_CLUSTERED_APIC - return CPU_MASK_ALL; -#else - return cpumask_of_cpu(smp_processor_id()); -#endif + return &CPU_MASK_ALL; } -#if defined CONFIG_ES7000_CLUSTERED_APIC -#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define INT_DELIVERY_MODE (dest_LowestPrio) -#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ -#define NO_BALANCE_IRQ (1) -#undef WAKE_SECONDARY_VIA_INIT -#define WAKE_SECONDARY_VIA_MIP -#else +static inline const cpumask_t *target_cpus(void) +{ + return &cpumask_of_cpu(smp_processor_id()); +} + +#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio) +#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ_CLUSTER (1) + #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target procs */ #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#define WAKE_SECONDARY_VIA_INIT -#endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -60,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ +static inline void init_apic_ldr_cluster(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + static inline void init_apic_ldr(void) { unsigned long val; @@ -70,17 +76,14 @@ static inline void init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifndef CONFIG_X86_GENERICARCH -extern void enable_apic_mode(void); -#endif - extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? - "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]); + "Physical Cluster" : "Logical Cluster", + nr_ioapics, cpus_addr(*target_cpus())[0]); } static inline int multi_timer_check(int apic, int irq) @@ -98,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) return boot_cpu_physical_apicid; - else if (mps_cpu < NR_CPUS) + else if (mps_cpu < nr_cpu_ids) return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; @@ -118,9 +121,9 @@ extern u8 cpu_2_logical_apicid[]; static inline int cpu_to_logical_apicid(int cpu) { #ifdef CONFIG_SMP - if (cpu >= NR_CPUS) - return BAD_APICID; - return (int)cpu_2_logical_apicid[cpu]; + if (cpu >= nr_cpu_ids) + return BAD_APICID; + return (int)cpu_2_logical_apicid[cpu]; #else return logical_smp_processor_id(); #endif @@ -144,38 +147,64 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid) return (1); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int +cpu_mask_to_apicid_cluster(const struct cpumask *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpumask_weight(cpumask); /* Return id to all */ - if (num_bits_set == NR_CPUS) -#if defined CONFIG_ES7000_CLUSTERED_APIC + if (num_bits_set == nr_cpu_ids) return 0xFF; -#else - return cpu_to_logical_apicid(0); -#endif /* * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpumask_test_cpu(cpu, cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n", __func__); -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpus_weight(*cpumask); + /* Return id to all */ + if (num_bits_set == nr_cpu_ids) + return cpu_to_logical_apicid(0); + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = first_cpu(*cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, *cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); return cpu_to_logical_apicid(0); -#endif } apicid = new_apicid; cpus_found++; @@ -185,6 +214,24 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) return apicid; } + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask) +{ + int apicid = cpu_to_logical_apicid(0); + cpumask_var_t cpumask; + + if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) + return apicid; + + cpumask_and(cpumask, inmask, andmask); + cpumask_and(cpumask, cpumask, cpu_online_mask); + apicid = cpu_mask_to_apicid(cpumask); + + free_cpumask_var(cpumask); + return apicid; +} + static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h index 632a955fcc0..7e8ed24d4b8 100644 --- a/arch/x86/include/asm/es7000/ipi.h +++ b/arch/x86/include/asm/es7000/ipi.h @@ -1,24 +1,22 @@ #ifndef __ASM_ES7000_IPI_H #define __ASM_ES7000_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_ES7000_IPI_H */ diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h index 39849346191..78f0daaee43 100644 --- a/arch/x86/include/asm/es7000/wakecpu.h +++ b/arch/x86/include/asm/es7000/wakecpu.h @@ -1,36 +1,12 @@ #ifndef __ASM_ES7000_WAKECPU_H #define __ASM_ES7000_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#ifdef CONFIG_ES7000_CLUSTERED_APIC -#define WAKE_SECONDARY_VIA_MIP -#else -#define WAKE_SECONDARY_VIA_INIT -#endif - -#ifdef WAKE_SECONDARY_VIA_MIP -extern int es7000_start_cpu(int cpu, unsigned long eip); -static inline int -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) -{ - int boot_error = 0; - boot_error = es7000_start_cpu(phys_apicid, start_eip); - return boot_error; -} -#endif - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW 0x467 +#define TRAMPOLINE_PHYS_HIGH 0x469 static inline void wait_for_init_deassert(atomic_t *deassert) { -#ifdef WAKE_SECONDARY_VIA_INIT +#ifndef CONFIG_ES7000_CLUSTERED_APIC while (!atomic_read(deassert)) cpu_relax(); #endif @@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* __ASM_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9e8bc29b8b1..b55b4a7fbef 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,6 +1,33 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H +#ifdef __ASSEMBLY__ + + .macro MCOUNT_SAVE_FRAME + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + .endm + + .macro MCOUNT_RESTORE_FRAME + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + .endm + +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -17,8 +44,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) */ return addr - 1; } -#endif +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { + /* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relays on ftrace_return_to_handler. + * Defined in entry_32/64.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 74252264433..6cfdafa409d 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -29,6 +29,39 @@ extern int fix_aperture; #define AMD64_GARTCACHECTL 0x9c #define AMD64_GARTEN (1<<0) +#ifdef CONFIG_GART_IOMMU +extern int gart_iommu_aperture; +extern int gart_iommu_aperture_allowed; +extern int gart_iommu_aperture_disabled; + +extern void early_gart_iommu_check(void); +extern void gart_iommu_init(void); +extern void gart_iommu_shutdown(void); +extern void __init gart_parse_options(char *); +extern void gart_iommu_hole_init(void); + +#else +#define gart_iommu_aperture 0 +#define gart_iommu_aperture_allowed 0 +#define gart_iommu_aperture_disabled 1 + +static inline void early_gart_iommu_check(void) +{ +} +static inline void gart_iommu_init(void) +{ +} +static inline void gart_iommu_shutdown(void) +{ +} +static inline void gart_parse_options(char *options) +{ +} +static inline void gart_iommu_hole_init(void) +{ +} +#endif + extern int agp_amd64_init(void); static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 5cbd4fcc06f..746f37a7963 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -2,6 +2,7 @@ #define _ASM_X86_GENAPIC_32_H #include <asm/mpspec.h> +#include <asm/atomic.h> /* * Generic APIC driver interface. @@ -23,7 +24,7 @@ struct genapic { int (*probe)(void); int (*apic_id_registered)(void); - cpumask_t (*target_cpus)(void); + const struct cpumask *(*target_cpus)(void); int int_delivery_mode; int int_dest_mode; int ESR_DISABLE; @@ -56,15 +57,27 @@ struct genapic { unsigned (*get_apic_id)(unsigned long x); unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); - cpumask_t (*vector_allocation_domain)(int cpu); + unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); + unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); #ifdef CONFIG_SMP /* ipi */ - void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *mask, + int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); #endif + int (*wakeup_cpu)(int apicid, unsigned long start_eip); + int trampoline_phys_low; + int trampoline_phys_high; + void (*wait_for_init_deassert)(atomic_t *deassert); + void (*smp_callin_clear_local_apic)(void); + void (*store_NMI_vector)(unsigned short *high, unsigned short *low); + void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); + void (*inquire_remote_apic)(int apicid); }; #define APICFUNC(x) .x = x, @@ -105,16 +118,25 @@ struct genapic { APICFUNC(get_apic_id) \ .apic_id_mask = APIC_ID_MASK, \ APICFUNC(cpu_mask_to_apicid) \ - APICFUNC(vector_allocation_domain) \ + APICFUNC(cpu_mask_to_apicid_and) \ + APICFUNC(vector_allocation_domain) \ APICFUNC(acpi_madt_oem_check) \ IPIFUNC(send_IPI_mask) \ IPIFUNC(send_IPI_allbutself) \ IPIFUNC(send_IPI_all) \ APICFUNC(enable_apic_mode) \ APICFUNC(phys_pkg_id) \ + .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \ + .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \ + APICFUNC(wait_for_init_deassert) \ + APICFUNC(smp_callin_clear_local_apic) \ + APICFUNC(store_NMI_vector) \ + APICFUNC(restore_NMI_vector) \ + APICFUNC(inquire_remote_apic) \ } extern struct genapic *genapic; +extern void es7000_update_genapic_to_cluster(void); enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define get_uv_system_type() UV_NONE diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h index 13c4e96199e..adf32fb56aa 100644 --- a/arch/x86/include/asm/genapic_64.h +++ b/arch/x86/include/asm/genapic_64.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_GENAPIC_64_H #define _ASM_X86_GENAPIC_64_H +#include <linux/cpumask.h> + /* * Copyright 2004 James Cleverdon, IBM. * Subject to the GNU Public License, v.2 @@ -18,20 +20,26 @@ struct genapic { u32 int_delivery_mode; u32 int_dest_mode; int (*apic_id_registered)(void); - cpumask_t (*target_cpus)(void); - cpumask_t (*vector_allocation_domain)(int cpu); + const struct cpumask *(*target_cpus)(void); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); void (*init_apic_ldr)(void); /* ipi */ - void (*send_IPI_mask)(cpumask_t mask, int vector); + void (*send_IPI_mask)(const struct cpumask *mask, int vector); + void (*send_IPI_mask_allbutself)(const struct cpumask *mask, + int vector); void (*send_IPI_allbutself)(int vector); void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); /* */ - unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask); + unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); + unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask); unsigned int (*phys_pkg_id)(int index_msb); unsigned int (*get_apic_id)(unsigned long x); unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; + /* wakeup_secondary_cpu */ + int (*wakeup_cpu)(int apicid, unsigned long start_eip); }; extern struct genapic *genapic; diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index 5ca135e72f2..cf7954d1405 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -22,6 +22,8 @@ DECLARE_PER_CPU(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT #define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member) +#define inc_irq_stat(member) (__get_cpu_var(irq_stat).member++) + void ack_bad_irq(unsigned int irq); #include <linux/irq_cpustat.h> diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h index 1ba381fc51d..b5a6b5d5670 100644 --- a/arch/x86/include/asm/hardirq_64.h +++ b/arch/x86/include/asm/hardirq_64.h @@ -11,6 +11,8 @@ #define __ARCH_IRQ_STAT 1 +#define inc_irq_stat(member) add_pda(member, 1) + #define local_softirq_pending() read_pda(__softirq_pending) #define __ARCH_SET_SOFTIRQ_PENDING 1 diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b97aecb0b61..8de644b6b95 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -109,9 +109,7 @@ extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif #endif -#ifdef CONFIG_X86_32 -extern void (*const interrupt[NR_VECTORS])(void); -#endif +extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h new file mode 100644 index 00000000000..369f5c5d09a --- /dev/null +++ b/arch/x86/include/asm/hypervisor.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2008, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__HYPERVISOR_H +#define ASM_X86__HYPERVISOR_H + +extern unsigned long get_hypervisor_tsc_freq(void); +extern void init_hypervisor(struct cpuinfo_x86 *c); + +#endif diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index 97989c0e534..50ca486fd88 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -129,24 +129,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -struct sigframe32 { - u32 pretcode; - int sig; - struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; - unsigned int extramask[_COMPAT_NSIG_WORDS-1]; -}; - -struct rt_sigframe32 { - u32 pretcode; - int sig; - u32 pinfo; - u32 puc; - compat_siginfo_t info; - struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; -}; - struct ustat32 { __u32 f_tfree; compat_ino_t f_tinode; diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h index 44c89c3a23e..38d87379e27 100644 --- a/arch/x86/include/asm/idle.h +++ b/arch/x86/include/asm/idle.h @@ -8,8 +8,13 @@ struct notifier_block; void idle_notifier_register(struct notifier_block *n); void idle_notifier_unregister(struct notifier_block *n); +#ifdef CONFIG_X86_64 void enter_idle(void); void exit_idle(void); +#else /* !CONFIG_X86_64 */ +static inline void enter_idle(void) { } +static inline void exit_idle(void) { } +#endif /* CONFIG_X86_64 */ void c1e_remove_cpu(int cpu); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ac2abc88cd9..05cfed4485f 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include <linux/compiler.h> +#include <asm-generic/int-ll64.h> #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -45,21 +46,39 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define mmiowb() barrier() #ifdef CONFIG_X86_64 + build_mmio_read(readq, "q", unsigned long, "=r", :"memory") -build_mmio_read(__readq, "q", unsigned long, "=r", ) build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -build_mmio_write(__writeq, "q", unsigned long, "r", ) -#define readq_relaxed(a) __readq(a) -#define __raw_readq __readq -#define __raw_writeq writeq +#else + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 low, high; + + low = readl(p); + high = readl(p + 1); + + return low + ((u64)high << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} -/* Let people know we have them */ -#define readq readq -#define writeq writeq #endif -extern int iommu_bio_merge; +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq #ifdef CONFIG_X86_32 # include "io_32.h" diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h index fea325a1122..563c16270ba 100644 --- a/arch/x86/include/asm/io_64.h +++ b/arch/x86/include/asm/io_64.h @@ -232,8 +232,6 @@ void memset_io(volatile void __iomem *a, int b, size_t c); #define flush_write_buffers() -#define BIO_VMERGE_BOUNDARY iommu_bio_merge - /* * Convert a virtual cached pointer to an uncached pointer */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 6afd9933a7d..7a1f44ac1f1 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -156,11 +156,21 @@ extern int sis_apic_bug; /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; +/* 1 if "noapic" boot option passed */ +extern int noioapicquirk; + +/* -1 if "noapic" boot option passed */ +extern int noioapicreroute; + /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; static inline void disable_ioapic_setup(void) { +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif skip_ioapic_setup = 1; } @@ -188,17 +198,14 @@ extern void restore_IO_APIC_setup(void); extern void reinit_intr_remapped_IO_APIC(int); #endif -extern int probe_nr_irqs(void); +extern void probe_nr_irqs_gsi(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_init_mappings(void) { } -static inline int probe_nr_irqs(void) -{ - return NR_IRQS; -} +static inline void probe_nr_irqs_gsi(void) { } #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 0b500c5b644..a6ee9e6f530 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -7,42 +7,7 @@ extern struct dma_mapping_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; -extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len); - /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) -#ifdef CONFIG_GART_IOMMU -extern int gart_iommu_aperture; -extern int gart_iommu_aperture_allowed; -extern int gart_iommu_aperture_disabled; - -extern void early_gart_iommu_check(void); -extern void gart_iommu_init(void); -extern void gart_iommu_shutdown(void); -extern void __init gart_parse_options(char *); -extern void gart_iommu_hole_init(void); - -#else -#define gart_iommu_aperture 0 -#define gart_iommu_aperture_allowed 0 -#define gart_iommu_aperture_disabled 1 - -static inline void early_gart_iommu_check(void) -{ -} -static inline void gart_iommu_init(void) -{ -} -static inline void gart_iommu_shutdown(void) -{ -} -static inline void gart_parse_options(char *options) -{ -} -static inline void gart_iommu_hole_init(void) -{ -} -#endif - #endif /* _ASM_X86_IOMMU_H */ diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index f89dffb28aa..c745a306f7d 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector, native_apic_mem_write(APIC_ICR, cfg); } -static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) +static inline void send_IPI_mask_sequence(const struct cpumask *mask, + int vector) { unsigned long flags; unsigned long query_cpu; @@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) * - mbligh */ local_irq_save(flags); - for_each_cpu_mask_nr(query_cpu, mask) { + for_each_cpu(query_cpu, mask) { __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } +static inline void send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __send_IPI_dest_field( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); +} + #endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index bae0eda9548..592688ed04d 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -31,13 +31,9 @@ static inline int irq_canonicalize(int irq) # endif #endif -#ifdef CONFIG_IRQBALANCE -extern int irqbalance_disable(char *str); -#endif - #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> -extern void fixup_irqs(cpumask_t map); +extern void fixup_irqs(void); #endif extern unsigned int do_IRQ(struct pt_regs *regs); @@ -46,5 +42,6 @@ extern void native_init_IRQ(void); /* Interrupt vector management */ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); +extern int vector_used_by_percpu_irq(unsigned int vector); #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h index af2f02d27fc..86afd747345 100644 --- a/arch/x86/include/asm/irq_regs_32.h +++ b/arch/x86/include/asm/irq_regs_32.h @@ -9,6 +9,8 @@ #include <asm/percpu.h> +#define ARCH_HAS_OWN_IRQ_REGS + DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..f7ff65032b9 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -101,12 +101,23 @@ #define LAST_VM86_IRQ 15 #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) +#define NR_IRQS_LEGACY 16 + #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) + +#ifndef CONFIG_SPARSE_IRQ # if NR_CPUS < MAX_IO_APICS # define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) # else # define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) # endif +#else +# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) +# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) +# else +# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +# endif +#endif #elif defined(CONFIG_X86_VOYAGER) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index a1f22771a15..c61d8b2ab8b 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -5,21 +5,8 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 -# define VA_PGD 3 -# define PA_PTE_0 4 -# define VA_PTE_0 5 -# define PA_PTE_1 6 -# define VA_PTE_1 7 -# define PA_SWAP_PAGE 8 -# ifdef CONFIG_X86_PAE -# define PA_PMD_0 9 -# define VA_PMD_0 10 -# define PA_PMD_1 11 -# define VA_PMD_1 12 -# define PAGES_NR 13 -# else -# define PAGES_NR 9 -# endif +# define PA_SWAP_PAGE 3 +# define PAGES_NR 4 #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 @@ -170,6 +157,20 @@ relocate_kernel(unsigned long indirection_page, unsigned long start_address) ATTRIB_NORET; #endif +#ifdef CONFIG_X86_32 +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + pgd_t *pgd; +#ifdef CONFIG_X86_PAE + pmd_t *pmd0; + pmd_t *pmd1; +#endif + pte_t *pte0; + pte_t *pte1; +}; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8346be87cfa..730843d1d2f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -21,6 +21,7 @@ #include <asm/pvclock-abi.h> #include <asm/desc.h> +#include <asm/mtrr.h> #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 @@ -86,6 +87,7 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 extern spinlock_t kvm_lock; @@ -180,6 +182,8 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head oos_link; + /* * The following two entries are used to key the shadow page in the * hash table. @@ -190,13 +194,16 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - unsigned long slot_bitmap; /* One bit set per slot which has memory - * in this shadow page. - */ + /* + * One bit set per slot which has memory + * in this shadow page. + */ + DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ bool unsync; - bool unsync_children; + bool global; + unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ @@ -327,8 +334,10 @@ struct kvm_vcpu_arch { bool nmi_pending; bool nmi_injected; + bool nmi_window_open; - u64 mtrr[0x100]; + struct mtrr_state_type mtrr_state; + u32 pat; }; struct kvm_mem_alias { @@ -350,11 +359,13 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; - struct dmar_domain *intel_iommu_domain; + struct list_head oos_global_pages; + struct iommu_domain *iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; struct hlist_head irq_ack_notifier_list; + int vapics_in_nmi_mode; int round_robin_prev_vcpu; unsigned int tss_addr; @@ -378,6 +389,7 @@ struct kvm_vm_stat { u32 mmu_recycled; u32 mmu_cache_miss; u32 mmu_unsync; + u32 mmu_unsync_global; u32 remote_tlb_flush; u32 lpages; }; @@ -397,6 +409,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_wakeup; u32 request_irq_exits; + u32 request_nmi_exits; u32 irq_exits; u32 host_state_reload; u32 efer_reload; @@ -405,6 +418,7 @@ struct kvm_vcpu_stat { u32 insn_emulation_fail; u32 hypercalls; u32 irq_injections; + u32 nmi_injections; }; struct descriptor_table { @@ -477,6 +491,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); + int (*get_mt_mask_shift)(void); }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_base_ptes(u64 base_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); @@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes); + const u8 *new, int bytes, + bool guest_initiated); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -607,6 +624,8 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); + static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" -#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" -#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" -#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" -#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" -#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" -#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" -#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" - #define MSR_IA32_TIME_STAMP_COUNTER 0x010 #define TSS_IOPB_BASE_OFFSET 0x66 diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 25179a29f20..6a159732881 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h @@ -123,6 +123,7 @@ struct decode_cache { u8 ad_bytes; u8 rex_prefix; struct operand src; + struct operand src2; struct operand dst; bool has_seg_override; u8 seg_override; @@ -146,22 +147,18 @@ struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; - /* Linear faulting address (if emulating a page-faulting instruction) */ unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* decode cache */ - struct decode_cache decode; }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ @@ -170,7 +167,7 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ /* Host execution mode. */ -#if defined(__i386__) +#if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 #elif defined(CONFIG_X86_64) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index d28a507cef3..1caf57628b9 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -15,7 +15,7 @@ #define SHARED_SWITCHER_PAGES \ DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) /* Pages for switcher itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) +#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) /* We map at -4M for ease of mapping into the guest (one PTE page). */ #define SWITCHER_ADDR 0xFFC00000 diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index f61ee8f937e..5d98d0b68ff 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -57,5 +57,65 @@ #define __ALIGN_STR ".align 16,0x90" #endif +/* + * to check ENTRY_X86/END_X86 and + * KPROBE_ENTRY_X86/KPROBE_END_X86 + * unbalanced-missed-mixed appearance + */ +#define __set_entry_x86 .set ENTRY_X86_IN, 0 +#define __unset_entry_x86 .set ENTRY_X86_IN, 1 +#define __set_kprobe_x86 .set KPROBE_X86_IN, 0 +#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1 + +#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed" + +#define __check_entry_x86 \ + .ifdef ENTRY_X86_IN; \ + .ifeq ENTRY_X86_IN; \ + __macro_err_x86; \ + .abort; \ + .endif; \ + .endif + +#define __check_kprobe_x86 \ + .ifdef KPROBE_X86_IN; \ + .ifeq KPROBE_X86_IN; \ + __macro_err_x86; \ + .abort; \ + .endif; \ + .endif + +#define __check_entry_kprobe_x86 \ + __check_entry_x86; \ + __check_kprobe_x86 + +#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86 + +#define ENTRY_X86(name) \ + __check_entry_kprobe_x86; \ + __set_entry_x86; \ + .globl name; \ + __ALIGN; \ + name: + +#define END_X86(name) \ + __unset_entry_x86; \ + __check_entry_kprobe_x86; \ + .size name, .-name + +#define KPROBE_ENTRY_X86(name) \ + __check_entry_kprobe_x86; \ + __set_kprobe_x86; \ + .pushsection .kprobes.text, "ax"; \ + .globl name; \ + __ALIGN; \ + name: + +#define KPROBE_END_X86(name) \ + __unset_kprobe_x86; \ + __check_entry_kprobe_x86; \ + .size name, .-name; \ + .popsection + #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h index ff3a6c236c0..cc09cbbee27 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -8,12 +8,12 @@ #define APIC_DFR_VALUE (APIC_DFR_FLAT) -static inline cpumask_t target_cpus(void) +static inline const struct cpumask *target_cpus(void) { #ifdef CONFIG_SMP - return cpu_online_map; + return cpu_online_mask; #else - return cpumask_of_cpu(0); + return cpumask_of(0); #endif } @@ -28,15 +28,18 @@ static inline cpumask_t target_cpus(void) #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) #define phys_pkg_id (genapic->phys_pkg_id) #define vector_allocation_domain (genapic->vector_allocation_domain) #define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) #define send_IPI_self (genapic->send_IPI_self) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void setup_apic_routing(void); #else #define INT_DELIVERY_MODE dest_LowestPrio #define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ #define TARGET_CPUS (target_cpus()) +#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init /* * Set up the logical destination ID. * @@ -59,9 +62,19 @@ static inline int apic_id_registered(void) return physid_isset(read_apic_id(), phys_cpu_present_map); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask) { - return cpus_addr(cpumask)[0]; + return cpumask_bits(cpumask)[0]; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + unsigned long mask1 = cpumask_bits(cpumask)[0]; + unsigned long mask2 = cpumask_bits(andmask)[0]; + unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; + + return (unsigned int)(mask1 & mask2 & mask3); } static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) @@ -86,7 +99,7 @@ static inline int apicid_to_node(int logical_apicid) #endif } -static inline cpumask_t vector_allocation_domain(int cpu) +static inline void vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -96,8 +109,7 @@ static inline cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } }; } #endif @@ -129,7 +141,7 @@ static inline int cpu_to_logical_apicid(int cpu) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS && cpu_present(mps_cpu)) + if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h index fabca01ebac..191312d155d 100644 --- a/arch/x86/include/asm/mach-default/mach_ipi.h +++ b/arch/x86/include/asm/mach-default/mach_ipi.h @@ -4,7 +4,8 @@ /* Avoid include hell */ #define NMI_VECTOR 0x02 -void send_IPI_mask_bitmask(cpumask_t mask, int vector); +void send_IPI_mask_bitmask(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); void __send_IPI_shortcut(unsigned int shortcut, int vector); extern int no_broadcast; @@ -12,28 +13,27 @@ extern int no_broadcast; #ifdef CONFIG_X86_64 #include <asm/genapic.h> #define send_IPI_mask (genapic->send_IPI_mask) +#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself) #else -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_bitmask(mask, vector); } +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); #endif static inline void __local_send_IPI_allbutself(int vector) { - if (no_broadcast || vector == NMI_VECTOR) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); - send_IPI_mask(mask, vector); - } else + if (no_broadcast || vector == NMI_VECTOR) + send_IPI_mask_allbutself(cpu_online_mask, vector); + else __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } static inline void __local_send_IPI_all(int vector) { if (no_broadcast || vector == NMI_VECTOR) - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); else __send_IPI_shortcut(APIC_DEST_ALLINC, vector); } diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h index 9d80db91e99..ceb01366014 100644 --- a/arch/x86/include/asm/mach-default/mach_wakecpu.h +++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h @@ -1,17 +1,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H #define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#define WAKE_SECONDARY_VIA_INIT - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW (0x467) +#define TRAMPOLINE_PHYS_HIGH (0x469) static inline void wait_for_init_deassert(atomic_t *deassert) { @@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { } -#define inquire_remote_apic(apicid) do { \ - if (apic_verbosity >= APIC_DEBUG) \ - __inquire_remote_apic(apicid); \ - } while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ + if (apic_verbosity >= APIC_DEBUG) + __inquire_remote_apic(apicid); +} #endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h index dbab36d64d4..23bf52103b8 100644 --- a/arch/x86/include/asm/mach-default/smpboot_hooks.h +++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h @@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) CMOS_WRITE(0xa, 0xf); local_flush_tlb(); pr_debug("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + start_eip >> 4; pr_debug("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + start_eip & 0xf; pr_debug("3.\n"); } @@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void) */ CMOS_WRITE(0, 0xf); - *((volatile long *) phys_to_virt(0x467)) = 0; + *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h index 5180bd7478f..48553e958ad 100644 --- a/arch/x86/include/asm/mach-generic/mach_apic.h +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -24,9 +24,11 @@ #define check_phys_apicid_present (genapic->check_phys_apicid_present) #define check_apicid_used (genapic->check_apicid_used) #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid) +#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and) #define vector_allocation_domain (genapic->vector_allocation_domain) #define enable_apic_mode (genapic->enable_apic_mode) #define phys_pkg_id (genapic->phys_pkg_id) +#define wakeup_secondary_cpu (genapic->wakeup_cpu) extern void generic_bigsmp_probe(void); diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h new file mode 100644 index 00000000000..1ab16b168c8 --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H + +#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low) +#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high) +#define wait_for_init_deassert (genapic->wait_for_init_deassert) +#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic) +#define store_NMI_vector (genapic->store_NMI_vector) +#define restore_NMI_vector (genapic->restore_NMI_vector) +#define inquire_remote_apic (genapic->inquire_remote_apic) + +#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */ diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h index 8e10015781f..7e98ce1d2c0 100644 --- a/arch/x86/include/asm/mmu_context_32.h +++ b/arch/x86/include/asm/mmu_context_32.h @@ -4,9 +4,8 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -20,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev, /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); #ifdef CONFIG_SMP - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + x86_write_percpu(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); @@ -36,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, } #ifdef CONFIG_SMP else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); + x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next); if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 91885c28f66..62d14ce3cd0 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,13 +6,13 @@ #include <asm/mpspec_def.h> extern int apic_version[MAX_APICS]; +extern int pic_mode; #ifdef CONFIG_X86_32 #include <mach_mpspec.h> extern unsigned int def_to_bigsmp; extern u8 apicid_2_node[]; -extern int pic_mode; #ifdef CONFIG_X86_NUMAQ extern int mp_bus_id_to_node[MAX_MP_BUSSES]; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e38859d577a..cb58643947b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -85,7 +85,9 @@ /* AMD64 MSRs. Not complete. See the architecture manual for a more complete list. */ +#define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f +#define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c2a812ebde8..638bf624180 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -22,10 +22,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) } /* - * i386 calling convention returns 64-bit value in edx:eax, while - * x86_64 returns at rax. Also, the "A" constraint does not really - * mean rdx:rax in x86_64, so we need specialized behaviour for each - * architecture + * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" + * constraint has different meanings. For i386, "A" means exactly + * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, + * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 #define DECLARE_ARGS(val, low, high) unsigned low, high @@ -85,7 +85,8 @@ static inline void native_write_msr(unsigned int msr, asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory"); } -static inline int native_write_msr_safe(unsigned int msr, +/* Can be uninlined because referenced by paravirt */ +notrace static inline int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int err; @@ -181,10 +182,10 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) } #define rdtscl(low) \ - ((low) = (u32)native_read_tsc()) + ((low) = (u32)__native_read_tsc()) #define rdtscll(val) \ - ((val) = native_read_tsc()) + ((val) = __native_read_tsc()) #define rdpmc(counter, low, high) \ do { \ diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 7c1e4258b31..cb988aab716 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -57,6 +57,31 @@ struct mtrr_gentry { }; #endif /* !__i386__ */ +struct mtrr_var_range { + u32 base_lo; + u32 base_hi; + u32 mask_lo; + u32 mask_hi; +}; + +/* In the Intel processor's MTRR interface, the MTRR type is always held in + an 8 bit field: */ +typedef u8 mtrr_type; + +#define MTRR_NUM_FIXED_RANGES 88 +#define MTRR_MAX_VAR_RANGES 256 + +struct mtrr_state_type { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; + unsigned char enabled; + unsigned char have_fixed; + mtrr_type def_type; +}; + +#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) +#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) + /* These are the various ioctls */ #define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) #define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h index 0bf2a06b7a4..bf37bc49bd8 100644 --- a/arch/x86/include/asm/numaq/apic.h +++ b/arch/x86/include/asm/numaq/apic.h @@ -7,9 +7,9 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { - return CPU_MASK_ALL; + return &CPU_MASK_ALL; } #define NO_BALANCE_IRQ (1) @@ -63,8 +63,8 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) extern u8 cpu_2_logical_apicid[]; static inline int cpu_to_logical_apicid(int cpu) { - if (cpu >= NR_CPUS) - return BAD_APICID; + if (cpu >= nr_cpu_ids) + return BAD_APICID; return (int)cpu_2_logical_apicid[cpu]; } @@ -122,7 +122,13 @@ static inline void enable_apic_mode(void) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) +{ + return (int) 0xF; +} + +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { return (int) 0xF; } diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h index 935588d286c..a8374c65277 100644 --- a/arch/x86/include/asm/numaq/ipi.h +++ b/arch/x86/include/asm/numaq/ipi.h @@ -1,25 +1,22 @@ #ifndef __ASM_NUMAQ_IPI_H #define __ASM_NUMAQ_IPI_H -void send_IPI_mask_sequence(cpumask_t, int vector); +void send_IPI_mask_sequence(const struct cpumask *mask, int vector); +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const struct cpumask *mask, int vector) { send_IPI_mask_sequence(mask, vector); } static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask_allbutself(cpu_online_mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(cpu_online_mask, vector); } #endif /* __ASM_NUMAQ_IPI_H */ diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h index c577bda5b1c..6f499df8edd 100644 --- a/arch/x86/include/asm/numaq/wakecpu.h +++ b/arch/x86/include/asm/numaq/wakecpu.h @@ -3,12 +3,8 @@ /* This file copes with machines that wakeup secondary CPUs by NMIs */ -#define WAKE_SECONDARY_VIA_NMI - -#define TRAMPOLINE_LOW phys_to_virt(0x8) -#define TRAMPOLINE_HIGH phys_to_virt(0xa) - -#define boot_cpu_apicid boot_cpu_logical_apicid +#define TRAMPOLINE_PHYS_LOW (0x8) +#define TRAMPOLINE_PHYS_HIGH (0xa) /* We don't do anything here because we use NMI's to boot instead */ static inline void wait_for_init_deassert(atomic_t *deassert) @@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void) static inline void store_NMI_vector(unsigned short *high, unsigned short *low) { printk("Storing NMI vector\n"); - *high = *((volatile unsigned short *) TRAMPOLINE_HIGH); - *low = *((volatile unsigned short *) TRAMPOLINE_LOW); + *high = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)); + *low = + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)); } static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) { printk("Restoring NMI vector\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; - *((volatile unsigned short *) TRAMPOLINE_LOW) = *low; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = + *high; + *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = + *low; } -#define inquire_remote_apic(apicid) {} +static inline void inquire_remote_apic(int apicid) +{ +} #endif /* __ASM_NUMAQ_WAKECPU_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 875b38edf19..a977de23cb4 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -19,6 +19,8 @@ struct pci_sysdata { }; extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; /* scan a bus after allocating a pci_sysdata for it */ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, @@ -82,6 +84,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, static inline void early_quirks(void) { } #endif +extern void pci_iommu_alloc(void); + #endif /* __KERNEL__ */ #ifdef CONFIG_X86_32 @@ -98,9 +102,9 @@ static inline void early_quirks(void) { } #ifdef CONFIG_NUMA /* Returns the node based on pci bus */ -static inline int __pcibus_to_node(struct pci_bus *bus) +static inline int __pcibus_to_node(const struct pci_bus *bus) { - struct pci_sysdata *sd = bus->sysdata; + const struct pci_sysdata *sd = bus->sysdata; return sd->node; } @@ -109,6 +113,12 @@ static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) { return node_to_cpumask(__pcibus_to_node(bus)); } + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpumask_of_node(__pcibus_to_node(bus)); +} #endif #endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index d02d936840a..4da20798277 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -23,7 +23,6 @@ extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); extern void dma32_reserve_bootmem(void); -extern void pci_iommu_alloc(void); /* The PCI address space does equal the physical memory * address space. The networking and block device layers use diff --git a/arch/x86/pci/pci.h b/arch/x86/include/asm/pci_x86.h index 15b9cf6be72..e60fd3e14bd 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/include/asm/pci_x86.h @@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops; struct irq_info { u8 bus, devfn; /* Bus, device and function */ struct { - u8 link; /* IRQ line ID, chipset dependent, 0=not routed */ + u8 link; /* IRQ line ID, chipset dependent, + 0 = not routed */ u16 bitmap; /* Available IRQs */ } __attribute__((packed)) irq[4]; u8 slot; /* Slot number, 0=onboard */ @@ -69,11 +70,13 @@ struct irq_routing_table { u16 version; /* PIRQ_VERSION */ u16 size; /* Table size in bytes */ u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */ - u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */ - u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */ + u16 exclusive_irqs; /* IRQs devoted exclusively to + PCI usage */ + u16 rtr_vendor, rtr_device; /* Vendor and device ID of + interrupt router */ u32 miniport_data; /* Crap */ u8 rfu[11]; - u8 checksum; /* Modulo 256 checksum must give zero */ + u8 checksum; /* Modulo 256 checksum must give 0 */ struct irq_info slots[0]; } __attribute__((packed)); @@ -96,6 +99,7 @@ extern struct pci_raw_ops *raw_pci_ops; extern struct pci_raw_ops *raw_pci_ext_ops; extern struct pci_raw_ops pci_direct_conf1; +extern bool port_cf9_safe; /* arch_initcall level */ extern int pci_direct_probe(void); @@ -147,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos) static inline void mmio_config_writeb(void __iomem *pos, u8 val) { - asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory"); + asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory"); } static inline void mmio_config_writew(void __iomem *pos, u16 val) { - asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory"); + asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory"); } static inline void mmio_config_writel(void __iomem *pos, u32 val) { - asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory"); + asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); } diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index b17edfd2362..e0d199fe1d8 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -56,23 +56,55 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define pte_none(x) (!(x).pte_low) /* - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset - * into this range: + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, + * split up the 29 bits of offset into this range: */ #define PTE_FILE_MAX_BITS 29 +#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) +#else +#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) +#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) +#endif +#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) +#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) #define pte_to_pgoff(pte) \ - ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5)) + ((((pte).pte_low >> PTE_FILE_SHIFT1) \ + & ((1U << PTE_FILE_BITS1) - 1)) \ + + ((((pte).pte_low >> PTE_FILE_SHIFT2) \ + & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1) \ + + (((pte).pte_low >> PTE_FILE_SHIFT3) \ + << (PTE_FILE_BITS1 + PTE_FILE_BITS2))) #define pgoff_to_pte(off) \ - ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \ - (((off) >> 5) << 8) + _PAGE_FILE }) + ((pte_t) { .pte_low = \ + (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ + + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1)) \ + << PTE_FILE_SHIFT2) \ + + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ + << PTE_FILE_SHIFT3) \ + + _PAGE_FILE }) /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x1f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 52597aeadff..447da43cddb 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -166,6 +166,7 @@ static inline int pte_none(pte_t pte) #define PTE_FILE_MAX_BITS 32 /* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) #define __swp_offset(x) ((x).val >> 5) #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c012f3b1167..83e69f4a37f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -10,7 +10,6 @@ #define _PAGE_BIT_PCD 4 /* page cache disabled */ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ -#define _PAGE_BIT_FILE 6 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ @@ -22,6 +21,12 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) @@ -46,11 +51,8 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -/* If _PAGE_PRESENT is clear, we use these: */ -#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, - * saved PTE; unset:swap */ -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; - pte_present gives true */ +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) @@ -158,8 +160,19 @@ #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ #endif +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + #ifndef __ASSEMBLY__ +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -329,6 +342,9 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask) #ifndef __ASSEMBLY__ +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING + #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f9d5889b336..72b020deb46 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -101,15 +101,6 @@ extern unsigned long pg0[]; #endif /* - * Macro to mark a page protection value as "uncacheable". - * On processors which do not support it, this is a no-op. - */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \ - : (prot)) - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 545a0e042bb..ba09289acca 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -146,7 +146,7 @@ static inline void native_pgd_clear(pgd_t *pgd) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -#define MAXMEM _AC(0x00003fffffffffff, UL) +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) @@ -177,12 +177,6 @@ static inline int pmd_bad(pmd_t pmd) #define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */ /* - * Macro to mark a page protection value as "uncacheable". - */ -#define pgprot_noncached(prot) \ - (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT)) - -/* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ @@ -250,10 +244,22 @@ static inline int pud_large(pud_t pte) extern int direct_gbpages; /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \ - ((offset) << 8) }) +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h index fe681147a4f..a8894647dd9 100644 --- a/arch/x86/include/asm/prctl.h +++ b/arch/x86/include/asm/prctl.h @@ -6,5 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 +#ifdef CONFIG_X86_64 +extern long sys_arch_prctl(int, unsigned long); +#endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e38326..091cd8855f2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + unsigned int x86_hyper_vendor; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -123,6 +124,9 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff +#define X86_HYPER_VENDOR_NONE 0 +#define X86_HYPER_VENDOR_VMWARE 1 + /* * capabilities of CPUs */ @@ -752,6 +756,19 @@ extern void switch_to_new_gdt(void); extern void cpu_init(void); extern void init_gdt(int cpu); +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index eefb0594b05..6d34d954c22 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -6,7 +6,6 @@ #include <asm/processor-flags.h> #ifdef __KERNEL__ -#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */ #include <asm/segment.h> #endif @@ -128,34 +127,6 @@ struct pt_regs { #endif /* !__i386__ */ -#ifdef CONFIG_X86_PTRACE_BTS -/* a branch trace record entry - * - * In order to unify the interface between various processor versions, - * we use the below data structure for all processors. - */ -enum bts_qualifier { - BTS_INVALID = 0, - BTS_BRANCH, - BTS_TASK_ARRIVES, - BTS_TASK_DEPARTS -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from_ip; - __u64 to_ip; - } lbr; - /* BTS_TASK_ARRIVES or - BTS_TASK_DEPARTS */ - __u64 jiffies; - } variant; -}; -#endif /* CONFIG_X86_PTRACE_BTS */ - #ifdef __KERNEL__ #include <linux/init.h> @@ -163,13 +134,6 @@ struct bts_struct { struct cpuinfo_x86; struct task_struct; -#ifdef CONFIG_X86_PTRACE_BTS -extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *); -extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier); -#else -#define ptrace_bts_init_intel(config) do {} while (0) -#endif /* CONFIG_X86_PTRACE_BTS */ - extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long @@ -271,6 +235,13 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); +extern void x86_ptrace_untrace(struct task_struct *); +extern void x86_ptrace_fork(struct task_struct *child, + unsigned long clone_flags); + +#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) +#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) + #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index df7710354f8..562d4fd31ba 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_REBOOT_H #define _ASM_X86_REBOOT_H +#include <linux/kdebug.h> + struct pt_regs; struct machine_ops { @@ -18,4 +20,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); void machine_real_restart(const unsigned char *code, int length); +typedef void (*nmi_shootdown_cb)(int, struct die_args*); +void nmi_shootdown_cpus(nmi_shootdown_cb callback); + #endif /* _ASM_X86_REBOOT_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d3723746..4fcd53fd5f4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -8,6 +8,10 @@ /* Interrupt control for vSMPowered x86_64 systems */ void vsmp_init(void); + +void setup_bios_corruption_check(void); + + #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); extern int is_visws_box(void); @@ -16,6 +20,8 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif +extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip); /* * Any setup quirks to be performed? */ @@ -39,6 +45,7 @@ struct x86_quirks { void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); int (*setup_ioapic_ids)(void); + int (*update_genapic)(void); }; extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h new file mode 100644 index 00000000000..4e0fe26d27d --- /dev/null +++ b/arch/x86/include/asm/sigframe.h @@ -0,0 +1,70 @@ +#ifndef _ASM_X86_SIGFRAME_H +#define _ASM_X86_SIGFRAME_H + +#include <asm/sigcontext.h> +#include <asm/siginfo.h> +#include <asm/ucontext.h> + +#ifdef CONFIG_X86_32 +#define sigframe_ia32 sigframe +#define rt_sigframe_ia32 rt_sigframe +#define sigcontext_ia32 sigcontext +#define _fpstate_ia32 _fpstate +#define ucontext_ia32 ucontext +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#include <asm/ia32.h> +#endif /* CONFIG_IA32_EMULATION */ + +#endif /* CONFIG_X86_32 */ + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +struct sigframe_ia32 { + u32 pretcode; + int sig; + struct sigcontext_ia32 sc; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. + */ + struct _fpstate_ia32 fpstate_unused; +#ifdef CONFIG_IA32_EMULATION + unsigned int extramask[_COMPAT_NSIG_WORDS-1]; +#else /* !CONFIG_IA32_EMULATION */ + unsigned long extramask[_NSIG_WORDS-1]; +#endif /* CONFIG_IA32_EMULATION */ + char retcode[8]; + /* fp state follows here */ +}; + +struct rt_sigframe_ia32 { + u32 pretcode; + int sig; + u32 pinfo; + u32 puc; +#ifdef CONFIG_IA32_EMULATION + compat_siginfo_t info; +#else /* !CONFIG_IA32_EMULATION */ + struct siginfo info; +#endif /* CONFIG_IA32_EMULATION */ + struct ucontext_ia32 uc; + char retcode[8]; + /* fp state follows here */ +}; +#endif /* defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) */ + +#ifdef CONFIG_X86_64 +struct rt_sigframe { + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + /* fp state follows here */ +}; +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_SIGFRAME_H */ diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 96ac44f275d..7761a5d554b 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -121,6 +121,10 @@ typedef unsigned long sigset_t; #ifndef __ASSEMBLY__ +# ifdef __KERNEL__ +extern void do_notify_resume(struct pt_regs *, void *, __u32); +# endif /* __KERNEL__ */ + #ifdef __i386__ # ifdef __KERNEL__ struct old_sigaction { @@ -141,8 +145,6 @@ struct k_sigaction { struct sigaction sa; }; -extern void do_notify_resume(struct pt_regs *, void *, __u32); - # else /* __KERNEL__ */ /* Here we must cater to libcs that poke about in kernel headers. */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index d12811ce51d..830b9fcb642 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -60,7 +60,7 @@ struct smp_ops { void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); - void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_ipi)(const struct cpumask *mask); void (*send_call_func_single_ipi)(int cpu); }; @@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu) static inline void arch_send_call_function_ipi(cpumask_t mask) { - smp_ops.send_call_func_ipi(mask); + smp_ops.send_call_func_ipi(&mask); } void cpu_disable_common(void); @@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); -void native_send_call_func_ipi(cpumask_t mask); +void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); extern void prefill_possible_map(void); diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index be44f7dab39..e3cc3c063ec 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 +# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ #endif #endif /* CONFIG_SPARSEMEM */ diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h index 9b3070f1c2a..4bb5fb34f03 100644 --- a/arch/x86/include/asm/summit/apic.h +++ b/arch/x86/include/asm/summit/apic.h @@ -14,13 +14,13 @@ #define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static inline cpumask_t target_cpus(void) +static inline const cpumask_t *target_cpus(void) { /* CPU_MASK_ALL (0xff) has undefined behaviour with * dest_LowestPrio mode logical clustered apic interrupt routing * Just start on cpu 0. IRQ balancing will spread load */ - return cpumask_of_cpu(0); + return &cpumask_of_cpu(0); } #define INT_DELIVERY_MODE (dest_LowestPrio) @@ -52,7 +52,7 @@ static inline void init_apic_ldr(void) int i; /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = NR_CPUS; --i >= 0; ) { + for (count = 0, i = nr_cpu_ids; --i >= 0; ) { lid = cpu_2_logical_apicid[i]; if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster) ++count; @@ -97,8 +97,8 @@ static inline int apicid_to_node(int logical_apicid) static inline int cpu_to_logical_apicid(int cpu) { #ifdef CONFIG_SMP - if (cpu >= NR_CPUS) - return BAD_APICID; + if (cpu >= nr_cpu_ids) + return BAD_APICID; return (int)cpu_2_logical_apicid[cpu]; #else return logical_smp_processor_id(); @@ -107,7 +107,7 @@ static inline int cpu_to_logical_apicid(int cpu) static inline int cpu_present_to_apicid(int mps_cpu) { - if (mps_cpu < NR_CPUS) + if (mps_cpu < nr_cpu_ids) return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); else return BAD_APICID; @@ -137,25 +137,25 @@ static inline void enable_apic_mode(void) { } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask) { int num_bits_set; int cpus_found = 0; int cpu; int apicid; - num_bits_set = cpus_weight(cpumask); + num_bits_set = cpus_weight(*cpumask); /* Return id to all */ - if (num_bits_set == NR_CPUS) + if (num_bits_set >= nr_cpu_ids) return (int) 0xFF; /* * The cpus in the mask must all be on the apic cluster. If are not * on the same apicid cluster return default value of TARGET_CPUS. */ - cpu = first_cpu(cpumask); + cpu = first_cpu(*cpumask); apicid = cpu_to_logical_apicid(cpu); while (cpus_found < num_bits_set) { - if (cpu_isset(cpu, cpumask)) { + if (cpu_isset(cpu, *cpumask)) { int new_apicid = cpu_to_logical_apicid(cpu); if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ @@ -170,6 +170,23 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) return apicid; } +static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask) +{ + int apicid = cpu_to_logical_apicid(0); + cpumask_var_t cpumask; + + if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) + return apicid; + + cpumask_and(cpumask, inmask, andmask); + cpumask_and(cpumask, cpumask, cpu_online_mask); + apicid = cpu_mask_to_apicid(cpumask); + + free_cpumask_var(cpumask); + return apicid; +} + /* cpuid returns the value latched in the HW at reset, not the APIC ID * register's value. For any box whose BIOS changes APIC IDs, like * clustered APIC systems, we must use hard_smp_processor_id. diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h index 53bd1e7bd7b..a8a2c24f50c 100644 --- a/arch/x86/include/asm/summit/ipi.h +++ b/arch/x86/include/asm/summit/ipi.h @@ -1,9 +1,10 @@ #ifndef __ASM_SUMMIT_IPI_H #define __ASM_SUMMIT_IPI_H -void send_IPI_mask_sequence(cpumask_t mask, int vector); +void send_IPI_mask_sequence(const cpumask_t *mask, int vector); +void send_IPI_mask_allbutself(const cpumask_t *mask, int vector); -static inline void send_IPI_mask(cpumask_t mask, int vector) +static inline void send_IPI_mask(const cpumask_t *mask, int vector) { send_IPI_mask_sequence(mask, vector); } @@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector) cpu_clear(smp_processor_id(), mask); if (!cpus_empty(mask)) - send_IPI_mask(mask, vector); + send_IPI_mask(&mask, vector); } static inline void send_IPI_all(int vector) { - send_IPI_mask(cpu_online_map, vector); + send_IPI_mask(&cpu_online_map, vector); } #endif /* __ASM_SUMMIT_IPI_H */ diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h index 1b8afa78e86..1b8afa78e86 100644 --- a/arch/x86/kvm/svm.h +++ b/arch/x86/include/asm/svm.h diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 51fb2c76ad7..b9e4e20174f 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -1,46 +1,10 @@ #ifndef _ASM_X86_SWIOTLB_H #define _ASM_X86_SWIOTLB_H -#include <asm/dma-mapping.h> +#include <linux/swiotlb.h> /* SWIOTLB interface */ -extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, - size_t size, int dir); -extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags); -extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, int dir); -extern void swiotlb_sync_single_for_cpu(struct device *hwdev, - dma_addr_t dev_addr, - size_t size, int dir); -extern void swiotlb_sync_single_for_device(struct device *hwdev, - dma_addr_t dev_addr, - size_t size, int dir); -extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev, - dma_addr_t dev_addr, - unsigned long offset, - size_t size, int dir); -extern void swiotlb_sync_single_range_for_device(struct device *hwdev, - dma_addr_t dev_addr, - unsigned long offset, - size_t size, int dir); -extern void swiotlb_sync_sg_for_cpu(struct device *hwdev, - struct scatterlist *sg, int nelems, - int dir); -extern void swiotlb_sync_sg_for_device(struct device *hwdev, - struct scatterlist *sg, int nelems, - int dir); -extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); -extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); -extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr); -extern void swiotlb_free_coherent(struct device *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle); -extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); -extern void swiotlb_init(void); - extern int swiotlb_force; #ifdef CONFIG_SWIOTLB diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h new file mode 100644 index 00000000000..ffb08be2a53 --- /dev/null +++ b/arch/x86/include/asm/sys_ia32.h @@ -0,0 +1,101 @@ +/* + * sys_ia32.h - Linux ia32 syscall interfaces + * + * Copyright (c) 2008 Jaswinder Singh Rajput + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#ifndef _ASM_X86_SYS_IA32_H +#define _ASM_X86_SYS_IA32_H + +#include <linux/compiler.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/signal.h> +#include <asm/compat.h> +#include <asm/ia32.h> + +/* ia32/sys_ia32.c */ +asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long); +asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); + +asmlinkage long sys32_stat64(char __user *, struct stat64 __user *); +asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); +asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); +asmlinkage long sys32_fstatat(unsigned int, char __user *, + struct stat64 __user *, int); +struct mmap_arg_struct; +asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); +asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); + +asmlinkage long sys32_pipe(int __user *); +struct sigaction32; +struct old_sigaction32; +asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, + struct sigaction32 __user *, unsigned int); +asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *, + struct old_sigaction32 __user *); +asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *, + compat_sigset_t __user *, unsigned int); +asmlinkage long sys32_alarm(unsigned int); + +struct sel_arg_struct; +asmlinkage long sys32_old_select(struct sel_arg_struct __user *); +asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); +asmlinkage long sys32_sysfs(int, u32, u32); + +asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, + struct compat_timespec __user *); +asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); +asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); + +#ifdef CONFIG_SYSCTL_SYSCALL +struct sysctl_ia32; +asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *); +#endif + +asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); +asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); + +asmlinkage long sys32_personality(unsigned long); +asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); + +asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, + unsigned long, unsigned long, unsigned long); + +struct oldold_utsname; +struct old_utsname; +asmlinkage long sys32_olduname(struct oldold_utsname __user *); +long sys32_uname(struct old_utsname __user *); + +long sys32_ustat(unsigned, struct ustat32 __user *); + +asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, + compat_uptr_t __user *, struct pt_regs *); +asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); + +long sys32_lseek(unsigned int, int, unsigned int); +long sys32_kill(int, int); +long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); +long sys32_vm86_warning(void); +long sys32_lookup_dcookie(u32, u32, char __user *, size_t); + +asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t); +asmlinkage long sys32_sync_file_range(int, unsigned, unsigned, + unsigned, unsigned, int); +asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int); +asmlinkage long sys32_fallocate(int, int, unsigned, + unsigned, unsigned, unsigned); + +/* ia32/ia32_signal.c */ +asmlinkage long sys32_sigsuspend(int, int, old_sigset_t); +asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *, + stack_ia32_t __user *, struct pt_regs *); +asmlinkage long sys32_sigreturn(struct pt_regs *); +asmlinkage long sys32_rt_sigreturn(struct pt_regs *); + +/* ia32/ipc32.c */ +asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); +#endif /* _ASM_X86_SYS_IA32_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 87803da4401..9c6797c3e56 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -19,6 +19,13 @@ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/ldt.c */ +asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); + +/* kernel/tls.c */ +asmlinkage int sys_set_thread_area(struct user_desc __user *); +asmlinkage int sys_get_thread_area(struct user_desc __user *); + /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ @@ -33,14 +40,11 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); asmlinkage int sys_sigaltstack(unsigned long); asmlinkage unsigned long sys_sigreturn(unsigned long); -asmlinkage int sys_rt_sigreturn(unsigned long); +asmlinkage int sys_rt_sigreturn(struct pt_regs); /* kernel/ioport.c */ asmlinkage long sys_iopl(unsigned long); -/* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); - /* kernel/sys_i386_32.c */ asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); @@ -54,10 +58,6 @@ asmlinkage int sys_uname(struct old_utsname __user *); struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); -/* kernel/tls.c */ -asmlinkage int sys_set_thread_area(struct user_desc __user *); -asmlinkage int sys_get_thread_area(struct user_desc __user *); - /* kernel/vm86_32.c */ asmlinkage int sys_vm86old(struct pt_regs); asmlinkage int sys_vm86(struct pt_regs); diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff..8e626ea33a1 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -17,12 +17,12 @@ # define AT_VECTOR_SIZE_ARCH 1 #endif -#ifdef CONFIG_X86_32 - struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +#ifdef CONFIG_X86_32 + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end); void default_idle(void); +void stop_this_cpu(void *dummy); + /* * Force strict CPU ordering. * And yes, this is required on UP too when we're talking diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..98789647baa 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,11 +20,13 @@ struct task_struct; struct exec_domain; #include <asm/processor.h> +#include <asm/ftrace.h> +#include <asm/atomic.h> struct thread_info { struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - unsigned long flags; /* low level flags */ + __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, @@ -91,7 +93,6 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ -#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -113,7 +114,6 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) -#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -139,8 +139,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ - _TIF_NOTSC) + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 4850e4b02b6..4e2f2e0aab2 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -61,13 +61,19 @@ static inline int cpu_to_node(int cpu) * * Side note: this function creates the returned cpumask on the stack * so with a high NR_CPUS count, excessive stack space is used. The - * node_to_cpumask_ptr function should be used whenever possible. + * cpumask_of_node function should be used whenever possible. */ static inline cpumask_t node_to_cpumask(int node) { return node_to_cpumask_map[node]; } +/* Returns a bitmask of CPUs on Node 'node'. */ +static inline const struct cpumask *cpumask_of_node(int node) +{ + return &node_to_cpumask_map[node]; +} + #else /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. */ @@ -82,7 +88,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); extern int early_cpu_to_node(int cpu); -extern const cpumask_t *_node_to_cpumask_ptr(int node); +extern const cpumask_t *cpumask_of_node(int node); extern cpumask_t node_to_cpumask(int node); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ @@ -103,7 +109,7 @@ static inline int early_cpu_to_node(int cpu) } /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ -static inline const cpumask_t *_node_to_cpumask_ptr(int node) +static inline const cpumask_t *cpumask_of_node(int node) { return &node_to_cpumask_map[node]; } @@ -116,12 +122,15 @@ static inline cpumask_t node_to_cpumask(int node) #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -/* Replace default node_to_cpumask_ptr with optimized version */ +/* + * Replace default node_to_cpumask_ptr with optimized version + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = cpumask_of_node(node) #define node_to_cpumask_ptr_next(v, node) \ - v = _node_to_cpumask_ptr(node) + v = cpumask_of_node(node) #endif /* CONFIG_X86_64 */ @@ -187,7 +196,7 @@ extern int __node_distance(int, int); #define cpu_to_node(cpu) 0 #define early_cpu_to_node(cpu) 0 -static inline const cpumask_t *_node_to_cpumask_ptr(int node) +static inline const cpumask_t *cpumask_of_node(int node) { return &cpu_online_map; } @@ -200,12 +209,15 @@ static inline int node_to_first_cpu(int node) return first_cpu(cpu_online_map); } -/* Replace default node_to_cpumask_ptr with optimized version */ +/* + * Replace default node_to_cpumask_ptr with optimized version + * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" + */ #define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = cpumask_of_node(node) #define node_to_cpumask_ptr_next(v, node) \ - v = _node_to_cpumask_ptr(node) + v = cpumask_of_node(node) #endif #include <asm-generic/topology.h> @@ -214,18 +226,20 @@ static inline int node_to_first_cpu(int node) /* Returns the number of the first CPU on Node 'node'. */ static inline int node_to_first_cpu(int node) { - node_to_cpumask_ptr(mask, node); - return first_cpu(*mask); + return cpumask_first(cpumask_of_node(node)); } #endif extern cpumask_t cpu_coregroup_map(int cpu); +extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) +#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) /* indicates that pointers to the topology cpumask_t maps are valid */ #define arch_provides_topology_pointers yes @@ -239,7 +253,7 @@ struct pci_bus; void set_pci_bus_resources_arch_default(struct pci_bus *b); #ifdef CONFIG_SMP -#define mc_capable() (boot_cpu_data.x86_max_cores > 1) +#define mc_capable() (cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids) #define smt_capable() (smp_num_siblings > 1) #endif diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index fa0d79facdb..780ba0ab94f 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,6 +3,7 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_X86_TRAMPOLINE /* * Trampoline 80x86 program as an array. */ @@ -13,8 +14,14 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) #define TRAMPOLINE_BASE 0x6000 + extern unsigned long setup_trampoline(void); +extern void __init reserve_trampoline_memory(void); +#else +static inline void reserve_trampoline_memory(void) {}; +#endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 45dee286e45..2ee0a3bceed 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -46,6 +46,10 @@ dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long); dotraplinkage void do_invalid_TSS(struct pt_regs *, long); dotraplinkage void do_segment_not_present(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long); +#ifdef CONFIG_X86_64 +dotraplinkage void do_double_fault(struct pt_regs *, long); +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *); +#endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); @@ -72,10 +76,13 @@ static inline int get_si_code(unsigned long condition) extern int panic_on_unrecovered_nmi; extern int kstack_depth_to_print; -#ifdef CONFIG_X86_32 void math_error(void __user *); -unsigned long patch_espfix_desc(unsigned long, unsigned long); asmlinkage void math_emulate(long); +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long, unsigned long); +#else +asmlinkage void smp_thermal_interrupt(void); +asmlinkage void mce_threshold_interrupt(void); #endif #endif /* _ASM_X86_TRAPS_H */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 9cd83a8e40d..38ae163cc91 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -34,8 +34,6 @@ static inline cycles_t get_cycles(void) static __always_inline cycles_t vget_cycles(void) { - cycles_t cycles; - /* * We only do VDSOs on TSC capable CPUs, so this shouldnt * access boot_cpu_data (which is not VDSO-safe): @@ -44,11 +42,7 @@ static __always_inline cycles_t vget_cycles(void) if (!cpu_has_tsc) return 0; #endif - rdtsc_barrier(); - cycles = (cycles_t)__native_read_tsc(); - rdtsc_barrier(); - - return cycles; + return (cycles_t)__native_read_tsc(); } extern void tsc_init(void); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 99192bb55a5..4340055b755 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -352,14 +352,14 @@ do { \ #define __put_user_nocheck(x, ptr, size) \ ({ \ - long __pu_err; \ + int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ __pu_err; \ }) #define __get_user_nocheck(x, ptr, size) \ ({ \ - long __gu_err; \ + int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index d931d3b7e6f..7ed17ff502b 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -32,13 +32,18 @@ enum uv_bios_cmd { UV_BIOS_COMMON, UV_BIOS_GET_SN_INFO, - UV_BIOS_FREQ_BASE + UV_BIOS_FREQ_BASE, + UV_BIOS_WATCHLIST_ALLOC, + UV_BIOS_WATCHLIST_FREE, + UV_BIOS_MEMPROTECT, + UV_BIOS_GET_PARTITION_ADDR }; /* * Status values returned from a BIOS call. */ enum { + BIOS_STATUS_MORE_PASSES = 1, BIOS_STATUS_SUCCESS = 0, BIOS_STATUS_UNIMPLEMENTED = -ENOSYS, BIOS_STATUS_EINVAL = -EINVAL, @@ -71,6 +76,21 @@ union partition_info_u { }; }; +union uv_watchlist_u { + u64 val; + struct { + u64 blade : 16, + size : 32, + filler : 16; + }; +}; + +enum uv_memprotect { + UV_MEMPROT_RESTRICT_ACCESS, + UV_MEMPROT_ALLOW_AMO, + UV_MEMPROT_ALLOW_RW +}; + /* * bios calls have 6 parameters */ @@ -80,14 +100,20 @@ extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); +extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, + unsigned long *); +extern int uv_bios_mq_watchlist_free(int, int); +extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); +extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern void uv_bios_init(void); +extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; extern long sn_partition_id; -extern long uv_coherency_id; -extern long uv_region_size; -#define partition_coherence_id() (uv_coherency_id) +extern long sn_coherency_id; +extern long sn_region_size; +#define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index e2363253bbb..50423c7b56b 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -133,61 +133,61 @@ struct bau_msg_payload { * see table 4.2.3.0.1 in broacast_assist spec. */ struct bau_msg_header { - int dest_subnodeid:6; /* must be zero */ + unsigned int dest_subnodeid:6; /* must be zero */ /* bits 5:0 */ - int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */ - /* bits 20:6 */ - int command:8; /* message type */ + unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ + /* bits 20:6 */ /* first bit in node_map */ + unsigned int command:8; /* message type */ /* bits 28:21 */ /* 0x38: SN3net EndPoint Message */ - int rsvd_1:3; /* must be zero */ + unsigned int rsvd_1:3; /* must be zero */ /* bits 31:29 */ /* int will align on 32 bits */ - int rsvd_2:9; /* must be zero */ + unsigned int rsvd_2:9; /* must be zero */ /* bits 40:32 */ /* Suppl_A is 56-41 */ - int payload_2a:8; /* becomes byte 16 of msg */ + unsigned int payload_2a:8;/* becomes byte 16 of msg */ /* bits 48:41 */ /* not currently using */ - int payload_2b:8; /* becomes byte 17 of msg */ + unsigned int payload_2b:8;/* becomes byte 17 of msg */ /* bits 56:49 */ /* not currently using */ /* Address field (96:57) is never used as an address (these are address bits 42:3) */ - int rsvd_3:1; /* must be zero */ + unsigned int rsvd_3:1; /* must be zero */ /* bit 57 */ /* address bits 27:4 are payload */ /* these 24 bits become bytes 12-14 of msg */ - int replied_to:1; /* sent as 0 by the source to byte 12 */ + unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ /* bit 58 */ - int payload_1a:5; /* not currently used */ + unsigned int payload_1a:5;/* not currently used */ /* bits 63:59 */ - int payload_1b:8; /* not currently used */ + unsigned int payload_1b:8;/* not currently used */ /* bits 71:64 */ - int payload_1c:8; /* not currently used */ + unsigned int payload_1c:8;/* not currently used */ /* bits 79:72 */ - int payload_1d:2; /* not currently used */ + unsigned int payload_1d:2;/* not currently used */ /* bits 81:80 */ - int rsvd_4:7; /* must be zero */ + unsigned int rsvd_4:7; /* must be zero */ /* bits 88:82 */ - int sw_ack_flag:1; /* software acknowledge flag */ + unsigned int sw_ack_flag:1;/* software acknowledge flag */ /* bit 89 */ /* INTD trasactions at destination are to wait for software acknowledge */ - int rsvd_5:6; /* must be zero */ + unsigned int rsvd_5:6; /* must be zero */ /* bits 95:90 */ - int rsvd_6:5; /* must be zero */ + unsigned int rsvd_6:5; /* must be zero */ /* bits 100:96 */ - int int_both:1; /* if 1, interrupt both sockets on the blade */ + unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */ /* bit 101*/ - int fairness:3; /* usually zero */ + unsigned int fairness:3;/* usually zero */ /* bits 104:102 */ - int multilevel:1; /* multi-level multicast format */ + unsigned int multilevel:1; /* multi-level multicast format */ /* bit 105 */ /* 0 for TLB: endpoint multi-unicast messages */ - int chaining:1; /* next descriptor is part of this activation*/ + unsigned int chaining:1;/* next descriptor is part of this activation*/ /* bit 106 */ - int rsvd_7:21; /* must be zero */ + unsigned int rsvd_7:21; /* must be zero */ /* bits 127:107 */ }; diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 7a5782610b2..777327ef05c 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -113,25 +113,37 @@ */ #define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +struct uv_scir_s { + struct timer_list timer; + unsigned long offset; + unsigned long last; + unsigned long idle_on; + unsigned long idle_off; + unsigned char state; + unsigned char enabled; +}; + /* * The following defines attributes of the HUB chip. These attributes are * frequently referenced and are kept in the per-cpu data areas of each cpu. * They are kept together in a struct to minimize cache misses. */ struct uv_hub_info_s { - unsigned long global_mmr_base; - unsigned long gpa_mask; - unsigned long gnode_upper; - unsigned long lowmem_remap_top; - unsigned long lowmem_remap_base; - unsigned short pnode; - unsigned short pnode_mask; - unsigned short coherency_domain_number; - unsigned short numa_blade_id; - unsigned char blade_processor_id; - unsigned char m_val; - unsigned char n_val; + unsigned long global_mmr_base; + unsigned long gpa_mask; + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; + unsigned short coherency_domain_number; + unsigned short numa_blade_id; + unsigned char blade_processor_id; + unsigned char m_val; + unsigned char n_val; + struct uv_scir_s scir; }; + DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) @@ -163,6 +175,30 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); #define UV_APIC_PNODE_SHIFT 6 +/* Local Bus from cpu's perspective */ +#define LOCAL_BUS_BASE 0x1c00000 +#define LOCAL_BUS_SIZE (4 * 1024 * 1024) + +/* + * System Controller Interface Reg + * + * Note there are NO leds on a UV system. This register is only + * used by the system controller to monitor system-wide operation. + * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on + * a node. + * + * The window is located at top of ACPI MMR space + */ +#define SCIR_WINDOW_COUNT 64 +#define SCIR_LOCAL_MMR_BASE (LOCAL_BUS_BASE + \ + LOCAL_BUS_SIZE - \ + SCIR_WINDOW_COUNT) + +#define SCIR_CPU_HEARTBEAT 0x01 /* timer interrupt */ +#define SCIR_CPU_ACTIVITY 0x02 /* not idle */ +#define SCIR_CPU_HB_INTERVAL (HZ) /* once per second */ + /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. @@ -174,7 +210,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) - paddr += uv_hub_info->lowmem_remap_base; + paddr |= uv_hub_info->lowmem_remap_base; return paddr | uv_hub_info->gnode_upper; } @@ -182,19 +218,7 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) /* socket virtual --> UV global physical address */ static inline unsigned long uv_gpa(void *v) { - return __pa(v) | uv_hub_info->gnode_upper; -} - -/* socket virtual --> UV global physical address */ -static inline void *uv_vgpa(void *v) -{ - return (void *)uv_gpa(v); -} - -/* UV global physical address --> socket virtual */ -static inline void *uv_va(unsigned long gpa) -{ - return __va(gpa & uv_hub_info->gpa_mask); + return uv_soc_phys_ram_to_gpa(__pa(v)); } /* pnode, offset --> socket virtual */ @@ -277,6 +301,16 @@ static inline void uv_write_local_mmr(unsigned long offset, unsigned long val) *uv_local_mmr_address(offset) = val; } +static inline unsigned char uv_read_local_mmr8(unsigned long offset) +{ + return *((unsigned char *)uv_local_mmr_address(offset)); +} + +static inline void uv_write_local_mmr8(unsigned long offset, unsigned char val) +{ + *((unsigned char *)uv_local_mmr_address(offset)) = val; +} + /* * Structures and definitions for converting between cpu, node, pnode, and blade * numbers. @@ -351,5 +385,20 @@ static inline int uv_num_possible_blades(void) return uv_possible_blades; } -#endif /* _ASM_X86_UV_UV_HUB_H */ +/* Update SCIR state */ +static inline void uv_set_scir_bits(unsigned char value) +{ + if (uv_hub_info->scir.state != value) { + uv_hub_info->scir.state = value; + uv_write_local_mmr8(uv_hub_info->scir.offset, value); + } +} +static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) +{ + if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_cpu_hub_info(cpu)->scir.state = value; + uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); + } +} +#endif /* _ASM_X86_UV_UV_HUB_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h new file mode 100644 index 00000000000..59363627523 --- /dev/null +++ b/arch/x86/include/asm/virtext.h @@ -0,0 +1,132 @@ +/* CPU virtualization extensions handling + * + * This should carry the code for handling CPU virtualization extensions + * that needs to live in the kernel core. + * + * Author: Eduardo Habkost <ehabkost@redhat.com> + * + * Copyright (C) 2008, Red Hat Inc. + * + * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#ifndef _ASM_X86_VIRTEX_H +#define _ASM_X86_VIRTEX_H + +#include <asm/processor.h> +#include <asm/system.h> + +#include <asm/vmx.h> +#include <asm/svm.h> + +/* + * VMX functions: + */ + +static inline int cpu_has_vmx(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + + +/** Disable VMX on the current CPU + * + * vmxoff causes a undefined-opcode exception if vmxon was not run + * on the CPU previously. Only call this function if you know VMX + * is enabled. + */ +static inline void cpu_vmxoff(void) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); +} + +static inline int cpu_vmx_enabled(void) +{ + return read_cr4() & X86_CR4_VMXE; +} + +/** Disable VMX if it is enabled on the current CPU + * + * You shouldn't call this if cpu_has_vmx() returns 0. + */ +static inline void __cpu_emergency_vmxoff(void) +{ + if (cpu_vmx_enabled()) + cpu_vmxoff(); +} + +/** Disable VMX if it is supported and enabled on the current CPU + */ +static inline void cpu_emergency_vmxoff(void) +{ + if (cpu_has_vmx()) + __cpu_emergency_vmxoff(); +} + + + + +/* + * SVM functions: + */ + +/** Check if the CPU has SVM support + * + * You can use the 'msg' arg to get a message describing the problem, + * if the function returns zero. Simply pass NULL if you are not interested + * on the messages; gcc should take care of not generating code for + * the messages on this case. + */ +static inline int cpu_has_svm(const char **msg) +{ + uint32_t eax, ebx, ecx, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + if (msg) + *msg = "not amd"; + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + if (msg) + *msg = "can't execute cpuid_8000000a"; + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (msg) + *msg = "svm not available"; + return 0; + } + return 1; +} + + +/** Disable SVM on the current CPU + * + * You should call this only if cpu_has_svm() returned true. + */ +static inline void cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); +} + +/** Makes sure SVM is disabled, if it is supported on the CPU + */ +static inline void cpu_emergency_svm_disable(void) +{ + if (cpu_has_svm(NULL)) + cpu_svm_disable(); +} + +#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h index b7c0dea119f..61e08c0a290 100644 --- a/arch/x86/include/asm/vmi.h +++ b/arch/x86/include/asm/vmi.h @@ -223,9 +223,15 @@ struct pci_header { } __attribute__((packed)); /* Function prototypes for bootstrapping */ +#ifdef CONFIG_VMI extern void vmi_init(void); +extern void vmi_activate(void); extern void vmi_bringup(void); -extern void vmi_apply_boot_page_allocations(void); +#else +static inline void vmi_init(void) {} +static inline void vmi_activate(void) {} +static inline void vmi_bringup(void) {} +#endif /* State needed to start an application processor in an SMP system. */ struct vmi_ap_state { diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h new file mode 100644 index 00000000000..c11b7e100d8 --- /dev/null +++ b/arch/x86/include/asm/vmware.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2008, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#ifndef ASM_X86__VMWARE_H +#define ASM_X86__VMWARE_H + +extern unsigned long vmware_get_tsc_khz(void); +extern int vmware_platform(void); +extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); + +#endif diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h index ec5edc339da..d0238e6151d 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -63,10 +63,13 @@ #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 /* VMCS Encodings */ enum vmcs_field { @@ -112,6 +115,8 @@ enum vmcs_field { VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -120,6 +125,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -331,8 +338,9 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 -#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 +#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 @@ -356,4 +364,19 @@ enum vmcs_field { #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" + + + #endif diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 3f6000d95fe..5e79ca69432 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -33,8 +33,14 @@ #ifndef _ASM_X86_XEN_HYPERCALL_H #define _ASM_X86_XEN_HYPERCALL_H +#include <linux/kernel.h> +#include <linux/spinlock.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/types.h> + +#include <asm/page.h> +#include <asm/pgtable.h> #include <xen/interface/xen.h> #include <xen/interface/sched.h> diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index a38d25ac87d..81fbd735aec 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -33,39 +33,10 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H -#include <linux/types.h> -#include <linux/kernel.h> - -#include <xen/interface/xen.h> -#include <xen/interface/version.h> - -#include <asm/ptrace.h> -#include <asm/page.h> -#include <asm/desc.h> -#if defined(__i386__) -# ifdef CONFIG_X86_PAE -# include <asm-generic/pgtable-nopud.h> -# else -# include <asm-generic/pgtable-nopmd.h> -# endif -#endif -#include <asm/xen/hypercall.h> - /* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -/* arch/i386/mach-xen/evtchn.c */ -/* Force a proper event-channel callback from Xen. */ -extern void force_evtchn_callback(void); - -/* Turn jiffies into Xen system time. */ -u64 jiffies_to_st(unsigned long jiffies); - - -#define MULTI_UVMFLAGS_INDEX 3 -#define MULTI_UVMDOMID_INDEX 4 - enum xen_domain_type { XEN_NATIVE, XEN_PV_DOMAIN, @@ -74,9 +45,15 @@ enum xen_domain_type { extern enum xen_domain_type xen_domain_type; +#ifdef CONFIG_XEN #define xen_domain() (xen_domain_type != XEN_NATIVE) -#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN) +#else +#define xen_domain() (0) +#endif + +#define xen_pv_domain() (xen_domain() && xen_domain_type == XEN_PV_DOMAIN) +#define xen_hvm_domain() (xen_domain() && xen_domain_type == XEN_HVM_DOMAIN) + #define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN) -#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #endif /* _ASM_X86_XEN_HYPERVISOR_H */ diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bc628998a1b..7ef617ef1df 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -1,11 +1,16 @@ #ifndef _ASM_X86_XEN_PAGE_H #define _ASM_X86_XEN_PAGE_H +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/spinlock.h> #include <linux/pfn.h> #include <asm/uaccess.h> +#include <asm/page.h> #include <asm/pgtable.h> +#include <xen/interface/xen.h> #include <xen/features.h> /* Xen machine address */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b62a7667828..d364df03c1d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_early_printk.o = -pg endif # @@ -23,9 +24,9 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) -obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o +obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o @@ -65,6 +66,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o @@ -105,6 +107,10 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o + +obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64 + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -118,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o - obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4c51a2f8fd3..29dc0c89d4a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -538,9 +538,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *obj; struct acpi_madt_local_apic *lapic; - cpumask_t tmp_map, new_map; + cpumask_var_t tmp_map, new_map; u8 physid; int cpu; + int retval = -ENOMEM; if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) return -EINVAL; @@ -569,23 +570,37 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; - tmp_map = cpu_present_map; + if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&new_map, GFP_KERNEL)) + goto free_tmp_map; + + cpumask_copy(tmp_map, cpu_present_mask); acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED); /* * If mp_register_lapic successfully generates a new logical cpu * number, then the following will get us exactly what was mapped */ - cpus_andnot(new_map, cpu_present_map, tmp_map); - if (cpus_empty(new_map)) { + cpumask_andnot(new_map, cpu_present_mask, tmp_map); + if (cpumask_empty(new_map)) { printk ("Unable to map lapic to logical cpu number\n"); - return -EINVAL; + retval = -EINVAL; + goto free_new_map; } - cpu = first_cpu(new_map); + cpu = cpumask_first(new_map); *pcpu = cpu; - return 0; + retval = 0; + +free_new_map: + free_cpumask_var(new_map); +free_tmp_map: + free_cpumask_var(tmp_map); +out: + return retval; } /* wrapper to silence section mismatch warning */ @@ -598,7 +613,7 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { per_cpu(x86_cpu_to_apicid, cpu) = -1; - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); num_processors--; return (0); @@ -1360,6 +1375,17 @@ static void __init acpi_process_madt(void) disable_acpi(); } } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); #endif return; } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e4899e0e878..5113c080f0c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -20,10 +20,15 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> +#ifdef CONFIG_IOMMU_API +#include <linux/iommu.h> +#endif #include <asm/proto.h> #include <asm/iommu.h> +#include <asm/gart.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -37,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); +#ifdef CONFIG_IOMMU_API +static struct iommu_ops amd_iommu_ops; +#endif + /* * general struct to manage commands send to an IOMMU */ @@ -46,6 +55,68 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +static struct dma_ops_domain *find_protection_domain(u16 devid); + + +#ifdef CONFIG_AMD_IOMMU_STATS + +/* + * Initialization code for statistics collection + */ + +DECLARE_STATS_COUNTER(compl_wait); +DECLARE_STATS_COUNTER(cnt_map_single); +DECLARE_STATS_COUNTER(cnt_unmap_single); +DECLARE_STATS_COUNTER(cnt_map_sg); +DECLARE_STATS_COUNTER(cnt_unmap_sg); +DECLARE_STATS_COUNTER(cnt_alloc_coherent); +DECLARE_STATS_COUNTER(cnt_free_coherent); +DECLARE_STATS_COUNTER(cross_page); +DECLARE_STATS_COUNTER(domain_flush_single); +DECLARE_STATS_COUNTER(domain_flush_all); +DECLARE_STATS_COUNTER(alloced_io_mem); +DECLARE_STATS_COUNTER(total_map_requests); + +static struct dentry *stats_dir; +static struct dentry *de_isolate; +static struct dentry *de_fflush; + +static void amd_iommu_stats_add(struct __iommu_counter *cnt) +{ + if (stats_dir == NULL) + return; + + cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, + &cnt->value); +} + +static void amd_iommu_stats_init(void) +{ + stats_dir = debugfs_create_dir("amd-iommu", NULL); + if (stats_dir == NULL) + return; + + de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, + (u32 *)&amd_iommu_isolate); + + de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, + (u32 *)&amd_iommu_unmap_flush); + + amd_iommu_stats_add(&compl_wait); + amd_iommu_stats_add(&cnt_map_single); + amd_iommu_stats_add(&cnt_unmap_single); + amd_iommu_stats_add(&cnt_map_sg); + amd_iommu_stats_add(&cnt_unmap_sg); + amd_iommu_stats_add(&cnt_alloc_coherent); + amd_iommu_stats_add(&cnt_free_coherent); + amd_iommu_stats_add(&cross_page); + amd_iommu_stats_add(&domain_flush_single); + amd_iommu_stats_add(&domain_flush_all); + amd_iommu_stats_add(&alloced_io_mem); + amd_iommu_stats_add(&total_map_requests); +} + +#endif /* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) @@ -187,12 +258,56 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command(iommu, cmd); + if (!ret) + iommu->need_sync = true; spin_unlock_irqrestore(&iommu->lock, flags); return ret; } /* + * This function waits until an IOMMU has completed a completion + * wait command + */ +static void __iommu_wait_for_completion(struct amd_iommu *iommu) +{ + int ready = 0; + unsigned status = 0; + unsigned long i = 0; + + INC_STATS_COUNTER(compl_wait); + + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + } + + /* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); +} + +/* + * This function queues a completion wait command into the command + * buffer of an IOMMU + */ +static int __iommu_completion_wait(struct amd_iommu *iommu) +{ + struct iommu_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; + CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + + return __iommu_queue_command(iommu, &cmd); +} + +/* * This function is called whenever we need to ensure that the IOMMU has * completed execution of all commands we sent. It sends a * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs @@ -201,37 +316,23 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret = 0, ready = 0; - unsigned status = 0; - struct iommu_cmd cmd; - unsigned long flags, i = 0; + int ret = 0; + unsigned long flags; - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + spin_lock_irqsave(&iommu->lock, flags); - iommu->need_sync = 0; + if (!iommu->need_sync) + goto out; - spin_lock_irqsave(&iommu->lock, flags); + ret = __iommu_completion_wait(iommu); - ret = __iommu_queue_command(iommu, &cmd); + iommu->need_sync = false; if (ret) goto out; - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; - } - - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + __iommu_wait_for_completion(iommu); - if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) - printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); out: spin_unlock_irqrestore(&iommu->lock, flags); @@ -254,11 +355,24 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) ret = iommu_queue_command(iommu, &cmd); - iommu->need_sync = 1; - return ret; } +static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + u16 domid, int pde, int s) +{ + memset(cmd, 0, sizeof(*cmd)); + address &= PAGE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + /* * Generic command send function for invalidaing TLB entries */ @@ -268,21 +382,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, struct iommu_cmd cmd; int ret; - memset(&cmd, 0, sizeof(cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); - cmd.data[1] |= domid; - cmd.data[2] = lower_32_bits(address); - cmd.data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); ret = iommu_queue_command(iommu, &cmd); - iommu->need_sync = 1; - return ret; } @@ -318,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) { u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + INC_STATS_COUNTER(domain_flush_single); + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* + * This function is used to flush the IO/TLB for a given protection domain + * on every IOMMU in the system + */ +static void iommu_flush_domain(u16 domid) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + INC_STATS_COUNTER(domain_flush_all); + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + list_for_each_entry(iommu, &amd_iommu_list, list) { + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -335,15 +464,15 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) * supporting all features of AMD IOMMU page tables like level skipping * and full 64 bit address spaces. */ -static int iommu_map(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long phys_addr, - int prot) +static int iommu_map_page(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long phys_addr, + int prot) { u64 __pte, *pte, *page; bus_addr = PAGE_ALIGN(bus_addr); - phys_addr = PAGE_ALIGN(bus_addr); + phys_addr = PAGE_ALIGN(phys_addr); /* only support 512GB address spaces for now */ if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) @@ -385,6 +514,28 @@ static int iommu_map(struct protection_domain *dom, return 0; } +static void iommu_unmap_page(struct protection_domain *dom, + unsigned long bus_addr) +{ + u64 *pte; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + *pte = 0; +} + /* * This function checks if a specific unity mapping entry is needed for * this specific IOMMU. @@ -437,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); if (ret) return ret; /* @@ -568,6 +719,16 @@ static u16 domain_id_alloc(void) return id; } +static void domain_id_free(int id) +{ + unsigned long flags; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + if (id > 0 && id < MAX_DOMAIN_ID) + __clear_bit(id, amd_iommu_pd_alloc_bitmap); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + /* * Used to reserve address ranges in the aperture (e.g. for exclusion * ranges. @@ -584,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, iommu_area_reserve(dom->bitmap, start_page, pages); } -static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) +static void free_pagetable(struct protection_domain *domain) { int i, j; u64 *p1, *p2, *p3; - p1 = dma_dom->domain.pt_root; + p1 = domain->pt_root; if (!p1) return; @@ -599,7 +760,7 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) continue; p2 = IOMMU_PTE_PAGE(p1[i]); - for (j = 0; j < 512; ++i) { + for (j = 0; j < 512; ++j) { if (!IOMMU_PTE_PRESENT(p2[j])) continue; p3 = IOMMU_PTE_PAGE(p2[j]); @@ -610,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) } free_page((unsigned long)p1); + + domain->pt_root = NULL; } /* @@ -621,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; - dma_ops_free_pagetable(dom); + free_pagetable(&dom->domain); kfree(dom->pte_pages); @@ -660,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, goto free_dma_dom; dma_dom->domain.mode = PAGE_MODE_3_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); + dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; @@ -722,6 +886,15 @@ free_dma_dom: } /* + * little helper function to check whether a given protection domain is a + * dma_ops domain + */ +static bool dma_ops_domain(struct protection_domain *domain) +{ + return domain->flags & PD_DMA_OPS_MASK; +} + +/* * Find out the protection domain structure for a given PCI device. This * will give us the pointer to the page table root for example. */ @@ -741,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid) * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void set_device_domain(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static void attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); + domain->dev_cnt += 1; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -762,10 +936,118 @@ static void set_device_domain(struct amd_iommu *iommu, write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); iommu_queue_inv_dev_entry(iommu, devid); +} + +/* + * Removes a device from a protection domain (unlocked) + */ +static void __detach_device(struct protection_domain *domain, u16 devid) +{ + + /* lock domain */ + spin_lock(&domain->lock); + + /* remove domain from the lookup table */ + amd_iommu_pd_table[devid] = NULL; + + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; + + /* decrease reference counter */ + domain->dev_cnt -= 1; + + /* ready */ + spin_unlock(&domain->lock); +} + +/* + * Removes a device from a protection domain (with devtable_lock held) + */ +static void detach_device(struct protection_domain *domain, u16 devid) +{ + unsigned long flags; + + /* lock device table */ + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + __detach_device(domain, devid); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int device_change_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + struct protection_domain *domain; + struct dma_ops_domain *dma_domain; + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + unsigned long flags; + + if (devid > amd_iommu_last_bdf) + goto out; + + devid = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + goto out; + + domain = domain_for_device(devid); + + if (domain && !dma_ops_domain(domain)) + WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " + "to a non-dma-ops domain\n", dev_name(dev)); + + switch (action) { + case BUS_NOTIFY_BOUND_DRIVER: + if (domain) + goto out; + dma_domain = find_protection_domain(devid); + if (!dma_domain) + dma_domain = iommu->default_dom; + attach_device(iommu, &dma_domain->domain, devid); + printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " + "device %s\n", dma_domain->domain.id, dev_name(dev)); + break; + case BUS_NOTIFY_UNBIND_DRIVER: + if (!domain) + goto out; + detach_device(domain, devid); + break; + case BUS_NOTIFY_ADD_DEVICE: + /* allocate a protection domain if a device is added */ + dma_domain = find_protection_domain(devid); + if (dma_domain) + goto out; + dma_domain = dma_ops_domain_alloc(iommu, order); + if (!dma_domain) + goto out; + dma_domain->target_dev = devid; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + break; + default: + goto out; + } + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); - iommu->need_sync = 1; +out: + return 0; } +struct notifier_block device_nb = { + .notifier_call = device_change_notifier, +}; + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -801,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) list_for_each_entry(entry, &iommu_pd_list, list) { if (entry->target_dev == devid) { ret = entry; - list_del(&ret->list); break; } } @@ -852,12 +1133,14 @@ static int get_device_resources(struct device *dev, if (!dma_dom) dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; - set_device_domain(*iommu, *domain, *bdf); + attach_device(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device ", (*domain)->id); - print_devid(_bdf, 1); + "device %s\n", (*domain)->id, dev_name(dev)); } + if (domain_for_device(_bdf) == NULL) + attach_device(*iommu, *domain, _bdf); + return 1; } @@ -908,7 +1191,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, if (address >= dom->aperture_size) return; - WARN_ON(address & 0xfffULL || address > dom->aperture_size); + WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size); pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; pte += IOMMU_PTE_L0_INDEX(address); @@ -920,8 +1203,8 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, /* * This function contains common code for mapping of a physically - * contiguous memory region into DMA address space. It is uses by all - * mapping functions provided by this IOMMU driver. + * contiguous memory region into DMA address space. It is used by all + * mapping functions provided with this IOMMU driver. * Must be called with the domain lock held. */ static dma_addr_t __map_single(struct device *dev, @@ -942,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev, pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; + INC_STATS_COUNTER(total_map_requests); + + if (pages > 1) + INC_STATS_COUNTER(cross_page); + if (align) align_mask = (1UL << get_order(size)) - 1; @@ -958,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + ADD_STATS_COUNTER(alloced_io_mem, size); + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; @@ -981,7 +1271,8 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) + if ((dma_addr == bad_dma_address) || + (dma_addr + size > dma_dom->aperture_size)) return; pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); @@ -993,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu, start += PAGE_SIZE; } + SUB_STATS_COUNTER(alloced_io_mem, size); + dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { @@ -1014,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, dma_addr_t addr; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_single); + if (!check_device(dev)) return bad_dma_address; @@ -1025,14 +1320,16 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; + if (!dma_ops_domain(domain)) + return bad_dma_address; + spin_lock_irqsave(&domain->lock, flags); addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, dma_mask); if (addr == bad_dma_address) goto out; - if (unlikely(iommu->need_sync)) - iommu_completion_wait(iommu); + iommu_completion_wait(iommu); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1051,17 +1348,21 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_unmap_single); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, dir); - if (unlikely(iommu->need_sync)) - iommu_completion_wait(iommu); + iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1101,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, int mapped_elems = 0; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_sg); + if (!check_device(dev)) return 0; @@ -1111,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, if (!iommu || !domain) return map_sg_no_iommu(dev, sglist, nelems, dir); + if (!dma_ops_domain(domain)) + return 0; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1127,8 +1433,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - if (unlikely(iommu->need_sync)) - iommu_completion_wait(iommu); + iommu_completion_wait(iommu); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1161,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; + INC_STATS_COUNTER(cnt_unmap_sg); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1173,8 +1483,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - if (unlikely(iommu->need_sync)) - iommu_completion_wait(iommu); + iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1193,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size, phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; + INC_STATS_COUNTER(cnt_alloc_coherent); + if (!check_device(dev)) return NULL; @@ -1211,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size, return virt_addr; } + if (!dma_ops_domain(domain)) + goto out_free; + if (!dma_mask) dma_mask = *dev->dma_mask; @@ -1219,19 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { - free_pages((unsigned long)virt_addr, get_order(size)); - virt_addr = NULL; - goto out; - } + if (*dma_addr == bad_dma_address) + goto out_free; - if (unlikely(iommu->need_sync)) - iommu_completion_wait(iommu); + iommu_completion_wait(iommu); -out: spin_unlock_irqrestore(&domain->lock, flags); return virt_addr; + +out_free: + + free_pages((unsigned long)virt_addr, get_order(size)); + + return NULL; } /* @@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_free_coherent); + if (!check_device(dev)) return; @@ -1253,12 +1570,14 @@ static void free_coherent(struct device *dev, size_t size, if (!iommu || !domain) goto free_mem; + if (!dma_ops_domain(domain)) + goto free_mem; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - if (unlikely(iommu->need_sync)) - iommu_completion_wait(iommu); + iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -1297,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) * we don't need to preallocate the protection domains anymore. * For now we have to. */ -void prealloc_protection_domains(void) +static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; @@ -1306,7 +1625,7 @@ void prealloc_protection_domains(void) u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = (dev->bus->number << 8) | dev->devfn; + devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -1353,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; + iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; ret = iommu_init_unity_mappings(iommu); if (ret) goto free_domains; @@ -1376,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; + register_iommu(&amd_iommu_ops); + + bus_register_notifier(&pci_bus_type, &device_nb); + + amd_iommu_stats_init(); + return 0; free_domains: @@ -1387,3 +1713,224 @@ free_domains: return ret; } + +/***************************************************************************** + * + * The following functions belong to the exported interface of AMD IOMMU + * + * This interface allows access to lower level functions of the IOMMU + * like protection domain handling and assignement of devices to domains + * which is not possible with the dma_ops interface. + * + *****************************************************************************/ + +static void cleanup_domain(struct protection_domain *domain) +{ + unsigned long flags; + u16 devid; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) + if (amd_iommu_pd_table[devid] == domain) + __detach_device(domain, devid); + + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + spin_lock_init(&domain->lock); + domain->mode = PAGE_MODE_3_LEVEL; + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_free; + domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!domain->pt_root) + goto out_free; + + dom->priv = domain; + + return 0; + +out_free: + kfree(domain); + + return -ENOMEM; +} + +static void amd_iommu_domain_destroy(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + + if (!domain) + return; + + if (domain->dev_cnt > 0) + cleanup_domain(domain); + + BUG_ON(domain->dev_cnt != 0); + + free_pagetable(domain); + + domain_id_free(domain->id); + + kfree(domain); + + dom->priv = NULL; +} + +static void amd_iommu_detach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid > 0) + detach_device(domain, devid); + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return; + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); +} + +static int amd_iommu_attach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct protection_domain *old_domain; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return -EINVAL; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid >= amd_iommu_last_bdf || + devid != amd_iommu_alias_table[devid]) + return -EINVAL; + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return -EINVAL; + + old_domain = domain_for_device(devid); + if (old_domain) + return -EBUSY; + + attach_device(iommu, domain, devid); + + iommu_completion_wait(iommu); + + return 0; +} + +static int amd_iommu_map_range(struct iommu_domain *dom, + unsigned long iova, phys_addr_t paddr, + size_t size, int iommu_prot) +{ + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE); + int prot = 0; + int ret; + + if (iommu_prot & IOMMU_READ) + prot |= IOMMU_PROT_IR; + if (iommu_prot & IOMMU_WRITE) + prot |= IOMMU_PROT_IW; + + iova &= PAGE_MASK; + paddr &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + ret = iommu_map_page(domain, iova, paddr, prot); + if (ret) + return ret; + + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return 0; +} + +static void amd_iommu_unmap_range(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); + + iova &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + iommu_unmap_page(domain, iova); + iova += PAGE_SIZE; + } + + iommu_flush_domain(domain->id); +} + +static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, + unsigned long iova) +{ + struct protection_domain *domain = dom->priv; + unsigned long offset = iova & ~PAGE_MASK; + phys_addr_t paddr; + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + paddr = *pte & IOMMU_PAGE_MASK; + paddr |= offset; + + return paddr; +} + +static struct iommu_ops amd_iommu_ops = { + .domain_init = amd_iommu_domain_init, + .domain_destroy = amd_iommu_domain_destroy, + .attach_dev = amd_iommu_attach_device, + .detach_dev = amd_iommu_detach_device, + .map = amd_iommu_map_range, + .unmap = amd_iommu_unmap_range, + .iova_to_phys = amd_iommu_iova_to_phys, +}; + diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 30ae2701b3d..42c33cebf00 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -28,6 +28,7 @@ #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> #include <asm/iommu.h> +#include <asm/gart.h> /* * definitions for the ACPI scanning code @@ -121,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ -int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */ +bool amd_iommu_isolate = true; /* if true, device isolation is + enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -242,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -void __init iommu_enable(struct amd_iommu *iommu) +static void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " - "at %02x:%02x.%x cap 0x%hx\n", - iommu->dev->bus->number, - PCI_SLOT(iommu->dev->devfn), - PCI_FUNC(iommu->dev->devfn), - iommu->cap_ptr); + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } /* Function to enable IOMMU event logging and event interrupts */ -void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void __init iommu_enable_event_logging(struct amd_iommu *iommu) { iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); @@ -427,6 +425,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); return cmd_buf; @@ -1074,7 +1076,8 @@ int __init amd_iommu_init(void) goto free; /* IOMMU rlookup table - find the IOMMU for a specific device */ - amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_rlookup_table = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_rlookup_table == NULL) goto free; @@ -1212,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = 1; + amd_iommu_isolate = true; if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = 0; + amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9a32b37ee2e..676debfc170 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,8 +1,9 @@ /* * Firmware replacement code. * - * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. + * Work around broken BIOSes that don't set an aperture, only set the + * aperture in the AGP bridge, or set too small aperture. + * * If all fails map the aperture over some low memory. This is cheaper than * doing bounce buffering. The memory is lost. This is done at early boot * because only the bootmem allocator can allocate 32+MB. diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..b13d3c4dbd4 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/dmar.h> +#include <linux/ftrace.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -97,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer); #ifdef HAVE_X2APIC int x2apic; /* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; +static int x2apic_preenabled; +static int disable_x2apic; static __init int setup_nox2apic(char *str) { disable_x2apic = 1; @@ -118,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); int first_system_vector = 0xfe; -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - /* * Debug level, exported for io_apic.c */ @@ -141,7 +140,7 @@ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); +static void lapic_timer_broadcast(const struct cpumask *mask); static void apic_pm_activate(void); /* @@ -227,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id) apic_write(APIC_ICR, low); } -u64 xapic_icr_read(void) +static u64 xapic_icr_read(void) { u32 icr1, icr2; @@ -267,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id) wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } -u64 x2apic_icr_read(void) +static u64 x2apic_icr_read(void) { unsigned long val; @@ -441,6 +440,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0xffffffff); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -453,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, /* * Local APIC timer broadcast function */ -static void lapic_timer_broadcast(cpumask_t mask) +static void lapic_timer_broadcast(const struct cpumask *mask) { #ifdef CONFIG_SMP send_IPI_mask(mask, LOCAL_TIMER_VECTOR); @@ -469,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void) struct clock_event_device *levt = &__get_cpu_var(lapic_events); memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); + levt->cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(levt); } @@ -559,13 +559,13 @@ static int __init calibrate_by_pmtimer(long deltapm, long *delta) } else { res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " + pr_warning("APIC calibration not consistent " "with PM Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + pr_info("APIC delta adjusted to PM-Timer: " "%lu (%ld)\n", (unsigned long)res, *delta); *delta = (long)res; } @@ -645,8 +645,7 @@ static int __init calibrate_APIC_clock(void) */ if (calibration_result < (1000000 / HZ)) { local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -672,13 +671,9 @@ static int __init calibrate_APIC_clock(void) while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); - local_irq_disable(); - /* Stop the lapic timer */ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - local_irq_enable(); - /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); @@ -692,8 +687,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); + pr_warning("APIC timer disabled due to verification failure.\n"); return -1; } @@ -714,7 +708,7 @@ void __init setup_boot_APIC_clock(void) * broadcast mechanism is used. On UP systems simply ignore it. */ if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); + pr_info("Disabling APIC timer\n"); /* No broadcast on UP ! */ if (num_possible_cpus() > 1) { lapic_clockevent.mult = 1; @@ -741,7 +735,7 @@ void __init setup_boot_APIC_clock(void) if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; else - printk(KERN_WARNING "APIC timer registered as dummy," + pr_warning("APIC timer registered as dummy," " due to nmi_watchdog=%d!\n", nmi_watchdog); /* Setup the lapic or request the broadcast */ @@ -773,8 +767,7 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); /* Switch it off */ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); return; @@ -783,11 +776,7 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif + inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); } @@ -800,7 +789,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs *regs) +void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -814,9 +803,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); @@ -1093,7 +1080,7 @@ static void __cpuinit lapic_setup_esr(void) unsigned int oldvalue, value, maxlvt; if (!lapic_is_integrated()) { - printk(KERN_INFO "No ESR for 82489DX.\n"); + pr_info("No ESR for 82489DX.\n"); return; } @@ -1104,7 +1091,7 @@ static void __cpuinit lapic_setup_esr(void) * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ - printk(KERN_INFO "Leaving ESR disabled.\n"); + pr_info("Leaving ESR disabled.\n"); return; } @@ -1298,7 +1285,7 @@ void check_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (msr & X2APIC_ENABLE) { - printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); x2apic_preenabled = x2apic = 1; apic_ops = &x2apic_ops; } @@ -1310,7 +1297,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - printk("Enabling x2apic\n"); + pr_info("Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } @@ -1325,9 +1312,8 @@ void __init enable_IR_x2apic(void) return; if (!x2apic_preenabled && disable_x2apic) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of nox2apic\n"); + pr_info("Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); return; } @@ -1335,22 +1321,19 @@ void __init enable_IR_x2apic(void) panic("Bios already enabled x2apic, can't enforce nox2apic"); if (!x2apic_preenabled && skip_ioapic_setup) { - printk(KERN_INFO - "Skipped enabling x2apic and Interrupt-remapping " - "because of skipping io-apic setup\n"); + pr_info("Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); return; } ret = dmar_table_init(); if (ret) { - printk(KERN_INFO - "dmar_table_init() failed with %d:\n", ret); + pr_info("dmar_table_init() failed with %d:\n", ret); if (x2apic_preenabled) panic("x2apic enabled by bios. But IR enabling failed"); else - printk(KERN_INFO - "Not enabling x2apic,Intr-remapping\n"); + pr_info("Not enabling x2apic,Intr-remapping\n"); return; } @@ -1359,7 +1342,7 @@ void __init enable_IR_x2apic(void) ret = save_mask_IO_APIC_setup(); if (ret) { - printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + pr_info("Saving IO-APIC state failed: %d\n", ret); goto end; } @@ -1394,14 +1377,11 @@ end: if (!ret) { if (!x2apic_preenabled) - printk(KERN_INFO - "Enabled x2apic and interrupt-remapping\n"); + pr_info("Enabled x2apic and interrupt-remapping\n"); else - printk(KERN_INFO - "Enabled Interrupt-remapping\n"); + pr_info("Enabled Interrupt-remapping\n"); } else - printk(KERN_ERR - "Failed to enable Interrupt-remapping and x2apic\n"); + pr_err("Failed to enable Interrupt-remapping and x2apic\n"); #else if (!cpu_has_x2apic) return; @@ -1410,8 +1390,8 @@ end: panic("x2apic enabled prior OS handover," " enable CONFIG_INTR_REMAP"); - printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " - " and x2apic\n"); + pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); #endif return; @@ -1428,7 +1408,7 @@ end: static int __init detect_init_APIC(void) { if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); + pr_info("No local APIC present\n"); return -1; } @@ -1469,8 +1449,8 @@ static int __init detect_init_APIC(void) * "lapic" specified. */ if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); + pr_info("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); return -1; } /* @@ -1480,8 +1460,7 @@ static int __init detect_init_APIC(void) */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -1494,7 +1473,7 @@ static int __init detect_init_APIC(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); + pr_warning("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -1505,14 +1484,14 @@ static int __init detect_init_APIC(void) if (l & MSR_IA32_APICBASE_ENABLE) mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - printk(KERN_INFO "Found and enabled local APIC!\n"); + pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); + pr_info("No local APIC present or hardware disabled\n"); return -1; } #endif @@ -1588,12 +1567,12 @@ int __init APIC_init_uniprocessor(void) { #ifdef CONFIG_X86_64 if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); + pr_info("Apic disabled\n"); return -1; } if (!cpu_has_apic) { disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); + pr_info("Apic disabled by BIOS\n"); return -1; } #else @@ -1605,8 +1584,8 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", - boot_cpu_physical_apicid); + pr_err("BIOS bug, local APIC 0x%x not detected!...\n", + boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } @@ -1682,9 +1661,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1695,14 +1672,11 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else + inc_irq_stat(irq_spurious_count); + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif + pr_info("spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); irq_exit(); } @@ -1713,9 +1687,7 @@ void smp_error_interrupt(struct pt_regs *regs) { u32 v, v1; -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1724,17 +1696,18 @@ void smp_error_interrupt(struct pt_regs *regs) ack_APIC_irq(); atomic_inc(&irq_err_count); - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + /* + * Here is what the APIC error bits mean: + * 0: Send CS error + * 1: Receive CS error + * 2: Send accept error + * 3: Receive accept error + * 4: Reserved + * 5: Send illegal vector + * 6: Received illegal vector + * 7: Illegal register address + */ + pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1832,28 +1805,32 @@ void disconnect_bsp_APIC(int virt_wire_setup) void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - cpumask_t tmp_map; /* * Validate version */ if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", + pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", version); version = 0x10; } apic_version[apicid] = version; - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + if (num_processors >= nr_cpu_ids) { + int max = nr_cpu_ids; + int thiscpu = max + disabled_cpus; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i reached." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; return; } num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); + cpu = cpumask_next_zero(-1, cpu_present_mask); physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { @@ -1903,8 +1880,8 @@ void __cpuinit generic_processor_info(int apicid, int version) } #endif - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); } #ifdef CONFIG_X86_64 @@ -2106,7 +2083,7 @@ __cpuinit int apic_is_clustered_box(void) bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { /* are we being called early in kernel startup? */ if (bios_cpu_apicid) { id = bios_cpu_apicid[i]; @@ -2209,7 +2186,7 @@ static int __init apic_set_verbosity(char *arg) else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" + pr_warning("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5145a6e72bb..3a26525a3f3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -391,11 +391,7 @@ static int power_off; #else static int power_off = 1; #endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else static int realmode_power_off; -#endif #ifdef CONFIG_APM_ALLOW_INTS static int allow_ints = 1; #else diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6649d09ad88..ee4df08feee 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -11,7 +11,7 @@ #include <linux/suspend.h> #include <linux/kbuild.h> #include <asm/ucontext.h> -#include "sigframe.h" +#include <asm/sigframe.h> #include <asm/pgtable.h> #include <asm/fixmap.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 7fcf63d22f8..1d41d3f1edb 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -20,6 +20,8 @@ #include <xen/interface/xen.h> +#include <asm/sigframe.h> + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_UNISTD_64_H @@ -87,7 +89,7 @@ int main(void) BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); + offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); BLANK(); #endif DEFINE(pbe_address, offsetof(struct pbe, address)); diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index f0dfe6f17e7..f63882728d9 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -25,7 +25,7 @@ #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> -struct uv_systab uv_systab; +static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { @@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); -long uv_coherency_id; -EXPORT_SYMBOL_GPL(uv_coherency_id); -long uv_region_size; -EXPORT_SYMBOL_GPL(uv_region_size); +long sn_coherency_id; +EXPORT_SYMBOL_GPL(sn_coherency_id); +long sn_region_size; +EXPORT_SYMBOL_GPL(sn_region_size); int uv_type; @@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, return ret; } +int +uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, + unsigned long *intr_mmr_offset) +{ + union uv_watchlist_u size_blade; + u64 watchlist; + s64 ret; + + size_blade.size = mq_size; + size_blade.blade = blade; + + /* + * bios returns watchlist number or negative error number. + */ + ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, + size_blade.val, (u64)intr_mmr_offset, + (u64)&watchlist, 0); + if (ret < BIOS_STATUS_SUCCESS) + return ret; + + return watchlist; +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); + +int +uv_bios_mq_watchlist_free(int blade, int watchlist_num) +{ + return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, + blade, watchlist_num, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); + +s64 +uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) +{ + return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, + perms, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); + +s64 +uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) +{ + s64 ret; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, + (u64)addr, buf, (u64)len, 0); + return ret; +} +EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 00000000000..2ac0ab71412 --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,161 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kthread.h> +#include <linux/workqueue.h> +#include <asm/e820.h> +#include <asm/proto.h> + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + + +static __init int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static __init int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static __init int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", + num_scan_areas); + update_e820(); +} + + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void check_corruption(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) +{ + check_for_bios_corruption(); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); +} + +static int start_periodic_check_for_corruption(void) +{ + if (!memory_corruption_check || corruption_check_period == 0) + return 0; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + return 0; +} + +module_init(start_periodic_check_for_corruption); + diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..82db7f45e2d 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -2,8 +2,14 @@ # Makefile for x86-compatible CPU details and quirks # +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +endif + obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o +obj-y += vmware.o hypervisor.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index ef8f831af82..2cf23634b6d 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(c->initial_apicid, 0); #else c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; c->phys_proc_id = phys_pkg_id(core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = phys_pkg_id(0); #endif c->x86_max_cores = (core_level_siblings / smp_num_siblings); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8f1e31db2ad..7c878f6aa91 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..3f95a40f718 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -36,6 +36,7 @@ #include <asm/proto.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/hypervisor.h> #include "cpu.h" @@ -354,7 +355,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (smp_num_siblings > 1) { - if (smp_num_siblings > NR_CPUS) { + if (smp_num_siblings > nr_cpu_ids) { printk(KERN_WARNING "CPU: Unsupported number of siblings %d", smp_num_siblings); smp_num_siblings = 1; @@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) detect_ht(c); #endif + init_hypervisor(c); /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -862,7 +864,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; +static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; void __cpuinit pda_init(int cpu) { @@ -903,8 +905,8 @@ void __cpuinit pda_init(int cpu) } } -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; +static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d4467..28102ad1a36 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/ftrace.h> #include <linux/acpi.h> #include <acpi/processor.h> @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; + struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } + trace_power_mark(&it, POWER_PSTATE, next_perf_state); + switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; @@ -513,6 +517,17 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) } } +static void free_acpi_perf_data(void) +{ + unsigned int i; + + /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ + for_each_possible_cpu(i) + free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) + ->shared_cpu_map); + free_percpu(acpi_perf_data); +} + /* * acpi_cpufreq_early_init - initialize ACPI P-States library * @@ -523,6 +538,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) */ static int __init acpi_cpufreq_early_init(void) { + unsigned int i; dprintk("acpi_cpufreq_early_init\n"); acpi_perf_data = alloc_percpu(struct acpi_processor_performance); @@ -530,6 +546,16 @@ static int __init acpi_cpufreq_early_init(void) dprintk("Memory allocation error for acpi_perf_data.\n"); return -ENOMEM; } + for_each_possible_cpu(i) { + if (!alloc_cpumask_var_node( + &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, + GFP_KERNEL, cpu_to_node(i))) { + + /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ + free_acpi_perf_data(); + return -ENOMEM; + } + } /* Do initialization in ACPI core */ acpi_processor_preregister_performance(acpi_perf_data); @@ -600,9 +626,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) */ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - policy->cpus = perf->shared_cpu_map; + cpumask_copy(&policy->cpus, perf->shared_cpu_map); } - policy->related_cpus = perf->shared_cpu_map; + cpumask_copy(&policy->related_cpus, perf->shared_cpu_map); #ifdef CONFIG_SMP dmi_check_system(sw_any_bug_dmi_table); @@ -791,7 +817,7 @@ static int __init acpi_cpufreq_init(void) ret = cpufreq_register_driver(&acpi_cpufreq_driver); if (ret) - free_percpu(acpi_perf_data); + free_acpi_perf_data(); return ret; } diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index b8e05ee4f73..beea4466b06 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -160,6 +160,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x0E: /* Core */ case 0x0F: /* Core Duo */ + case 0x16: /* Celeron Core */ p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE); case 0x0D: /* Pentium M (Dothan) */ @@ -171,7 +172,9 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } if (c->x86 != 0xF) { - printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n"); + if (!cpu_has(c, X86_FEATURE_EST)) + printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. " + "Please send an e-mail to <cpufreq@vger.kernel.org>\n"); return 0; } @@ -274,6 +277,7 @@ static struct cpufreq_driver p4clockmod_driver = { .name = "p4-clockmod", .owner = THIS_MODULE, .attr = p4clockmod_attr, + .hide_interface = 1, }; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 7c7d56b4313..1b446d79a8f 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -310,6 +310,12 @@ static int powernow_acpi_init(void) goto err0; } + if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, + GFP_KERNEL)) { + retval = -ENOMEM; + goto err05; + } + if (acpi_processor_register_performance(acpi_processor_perf, 0)) { retval = -EIO; goto err1; @@ -412,6 +418,8 @@ static int powernow_acpi_init(void) err2: acpi_processor_unregister_performance(acpi_processor_perf, 0); err1: + free_cpumask_var(acpi_processor_perf->shared_cpu_map); +err05: kfree(acpi_processor_perf); err0: printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n"); @@ -652,6 +660,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) { #ifdef CONFIG_X86_POWERNOW_K7_ACPI if (acpi_processor_perf) { acpi_processor_unregister_performance(acpi_processor_perf, 0); + free_cpumask_var(acpi_processor_perf->shared_cpu_map); kfree(acpi_processor_perf); } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7f05f44b97e..c3c9adbaa26 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -766,7 +766,7 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; - int ret_val; + int ret_val = -ENODEV; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -815,6 +815,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); + if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { + printk(KERN_ERR PFX + "unable to alloc powernow_k8_data cpumask\n"); + ret_val = -ENOMEM; + goto err_out_mem; + } + return 0; err_out_mem: @@ -826,7 +833,7 @@ err_out: /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ data->acpi_data.state_count = 0; - return -ENODEV; + return ret_val; } static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) @@ -929,6 +936,7 @@ static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { if (data->acpi_data.state_count) acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + free_cpumask_var(data->acpi_data.shared_cpu_map); } #else @@ -1134,7 +1142,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) data->cpu = pol->cpu; data->currpstate = HW_PSTATE_INVALID; - if (powernow_k8_cpu_init_acpi(data)) { + rc = powernow_k8_cpu_init_acpi(data); + if (rc) { /* * Use the PSB BIOS structure. This is only availabe on * an UP version, and is deprecated by AMD. @@ -1152,20 +1161,17 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) "ACPI maintainers and complain to your BIOS " "vendor.\n"); #endif - kfree(data); - return -ENODEV; + goto err_out; } if (pol->cpu != 0) { printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " "CPU other than CPU0. Complain to your BIOS " "vendor.\n"); - kfree(data); - return -ENODEV; + goto err_out; } rc = find_psb_table(data); if (rc) { - kfree(data); - return -ENODEV; + goto err_out; } } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 3b5f06423e7..f0ea6fa2f53 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -459,9 +459,7 @@ static int centrino_verify (struct cpufreq_policy *policy) * Sets a new CPUFreq policy. */ struct allmasks { - cpumask_t online_policy_cpus; cpumask_t saved_mask; - cpumask_t set_mask; cpumask_t covered_cpus; }; @@ -475,9 +473,7 @@ static int centrino_target (struct cpufreq_policy *policy, int retval = 0; unsigned int j, k, first_cpu, tmp; CPUMASK_ALLOC(allmasks); - CPUMASK_PTR(online_policy_cpus, allmasks); CPUMASK_PTR(saved_mask, allmasks); - CPUMASK_PTR(set_mask, allmasks); CPUMASK_PTR(covered_cpus, allmasks); if (unlikely(allmasks == NULL)) @@ -497,30 +493,28 @@ static int centrino_target (struct cpufreq_policy *policy, goto out; } -#ifdef CONFIG_HOTPLUG_CPU - /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); -#else - *online_policy_cpus = policy->cpus; -#endif - *saved_mask = current->cpus_allowed; first_cpu = 1; cpus_clear(*covered_cpus); - for_each_cpu_mask_nr(j, *online_policy_cpus) { + for_each_cpu_mask_nr(j, policy->cpus) { + const cpumask_t *mask; + + /* cpufreq holds the hotplug lock, so we are safe here */ + if (!cpu_online(j)) + continue; + /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq */ - cpus_clear(*set_mask); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - cpus_or(*set_mask, *set_mask, *online_policy_cpus); + mask = &policy->cpus; else - cpu_set(j, *set_mask); + mask = &cpumask_of_cpu(j); - set_cpus_allowed_ptr(current, set_mask); + set_cpus_allowed_ptr(current, mask); preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { + if (unlikely(!cpu_isset(smp_processor_id(), *mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -548,7 +542,9 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask_nr(k, *online_policy_cpus) { + for_each_cpu_mask_nr(k, policy->cpus) { + if (!cpu_online(k)) + continue; freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -571,7 +567,9 @@ static int centrino_target (struct cpufreq_policy *policy, preempt_enable(); } - for_each_cpu_mask_nr(k, *online_policy_cpus) { + for_each_cpu_mask_nr(k, policy->cpus) { + if (!cpu_online(k)) + continue; freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -584,18 +582,17 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(*covered_cpus)) - for_each_cpu_mask_nr(j, *covered_cpus) { - set_cpus_allowed_ptr(current, - &cpumask_of_cpu(j)); - wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); - } + for_each_cpu_mask_nr(j, *covered_cpus) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); + wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); + } tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask_nr(j, *online_policy_cpus) { - freqs.cpu = j; + for_each_cpu_mask_nr(j, policy->cpus) { + if (!cpu_online(j)) + continue; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index 98d4fdb7dc0..cdac7d62369 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -139,6 +139,15 @@ static unsigned int pentium_core_get_frequency(void) case 3: fsb = 166667; break; + case 2: + fsb = 200000; + break; + case 0: + fsb = 266667; + break; + case 4: + fsb = 333333; + break; default: printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 00000000000..fb5b86af0b0 --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,58 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria <akataria@vmware.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <asm/processor.h> +#include <asm/vmware.h> +#include <asm/hypervisor.h> + +static inline void __cpuinit +detect_hypervisor_vendor(struct cpuinfo_x86 *c) +{ + if (vmware_platform()) { + c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; + } else { + c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; + } +} + +unsigned long get_hypervisor_tsc_freq(void) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + return vmware_get_tsc_khz(); + return 0; +} + +static inline void __cpuinit +hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { + vmware_set_feature_bits(c); + return; + } +} + +void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +{ + detect_hypervisor_vendor(c); + hypervisor_set_feature_bits(c); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d5..8ea6929e974 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,6 @@ #include <asm/pgtable.h> #include <asm/msr.h> #include <asm/uaccess.h> -#include <asm/ptrace.h> #include <asm/ds.h> #include <asm/bugs.h> @@ -41,6 +40,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; #endif + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + } #ifdef CONFIG_X86_32 @@ -242,6 +251,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); + /* + * Detect the extended topology information if available. This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -307,13 +323,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P4); if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); - - if (cpu_has_bts) - ptrace_bts_init_intel(c); - #endif - detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* * let's use the legacy cpuid vector 0x1 and 0x4 for topology diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3f46afbb1cf..48533d77be7 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(cpuid4_info, cpu) = NULL; } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static void __cpuinit get_cpu_leaves(void *_retval) { - struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; - cpumask_t oldmask; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) - return -ENOMEM; - - oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - if (retval) - goto out; + int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf; this_leaf = CPUID4_INFO_IDX(cpu, j); - retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) { + *retval = cpuid4_cache_lookup(j, this_leaf); + if (unlikely(*retval < 0)) { int i; for (i = 0; i < j; i++) @@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) } cache_shared_cpu_map_setup(cpu, j); } - set_cpus_allowed_ptr(current, &oldmask); +} + +static int __cpuinit detect_cache_attributes(unsigned int cpu) +{ + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; -out: + per_cpu(cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(cpuid4_info, cpu) == NULL) + return -ENOMEM; + + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { kfree(per_cpu(cpuid4_info, cpu)); per_cpu(cpuid4_info, cpu) = NULL; @@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, cpumask_t *mask = &this_leaf->shared_cpu_map; n = type? - cpulist_scnprintf(buf, len-2, *mask): - cpumask_scnprintf(buf, len-2, *mask); + cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; buf[n] = '\0'; } @@ -644,20 +641,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { - switch(this_leaf->eax.split.type) { - case CACHE_TYPE_DATA: +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +{ + switch (this_leaf->eax.split.type) { + case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); - break; - case CACHE_TYPE_INST: + case CACHE_TYPE_INST: return sprintf(buf, "Instruction\n"); - break; - case CACHE_TYPE_UNIFIED: + case CACHE_TYPE_UNIFIED: return sprintf(buf, "Unified\n"); - break; - default: + default: return sprintf(buf, "Unknown\n"); - break; } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b031a4ac85..1c838032fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c) */ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) { - static cpumask_t mce_cpus = CPU_MASK_NONE; - mce_cpu_quirks(c); if (mce_dont_init || - cpu_test_and_set(smp_processor_id(), mce_cpus) || !mce_available(c)) return; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 5eb390a4b2e..a5a5e053037 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ * CPU Initialization */ +struct thresh_restart { + struct threshold_block *b; + int reset; + u16 old_limit; +}; + /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_block *b, - int reset, u16 old_limit) +static long threshold_restart_bank(void *_tr) { + struct thresh_restart *tr = _tr; u32 mci_misc_hi, mci_misc_lo; - rdmsr(b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); - if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) - reset = 1; /* limit cannot be lower than err count */ + if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ - if (reset) { /* reset err count and overflow bit */ + if (tr->reset) { /* reset err count and overflow bit */ mci_misc_hi = (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - b->threshold_limit); - } else if (old_limit) { /* change limit w/o reset */ + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (mci_misc_hi & THRESHOLD_MAX) + - (old_limit - b->threshold_limit); + (tr->old_limit - tr->b->threshold_limit); mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - b->interrupt_enable ? + tr->b->interrupt_enable ? (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(b->address, mci_misc_lo, mci_misc_hi); + wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + return 0; } /* cpu init entry point, called from mce.c with preempt off */ @@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int cpu = smp_processor_id(); u8 lvt_off; u32 low = 0, high = 0, address = 0; + struct thresh_restart tr; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) wrmsr(address, low, high); threshold_defaults.address = address; - threshold_restart_bank(&threshold_defaults, 0, 0); + tr.b = &threshold_defaults; + tr.reset = 0; + tr.old_limit = 0; + threshold_restart_bank(&tr); } } } @@ -237,7 +248,7 @@ asmlinkage void mce_threshold_interrupt(void) } } out: - add_pda(irq_threshold_count, 1); + inc_irq_stat(irq_threshold_count); irq_exit(); } @@ -251,20 +262,6 @@ struct threshold_attr { ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static void affinity_set(unsigned int cpu, cpumask_t *oldmask, - cpumask_t *newmask) -{ - *oldmask = current->cpus_allowed; - cpus_clear(*newmask); - cpu_set(cpu, *newmask); - set_cpus_allowed_ptr(current, newmask); -} - -static void affinity_restore(const cpumask_t *oldmask) -{ - set_cpus_allowed_ptr(current, oldmask); -} - #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ { \ @@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, 0); - affinity_restore(&oldmask); + tr.b = b; + tr.reset = 0; + tr.old_limit = 0; + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } @@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask, newmask; - u16 old; + struct thresh_restart tr; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; @@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b, new = THRESHOLD_MAX; if (new < 1) new = 1; - old = b->threshold_limit; + tr.old_limit = b->threshold_limit; b->threshold_limit = new; + tr.b = b; + tr.reset = 0; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 0, old); - affinity_restore(&oldmask); + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return end - buf; } -static ssize_t show_error_count(struct threshold_block *b, char *buf) +static long local_error_count(void *_b) { - u32 high, low; - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); + struct threshold_block *b = _b; + u32 low, high; + rdmsr(b->address, low, high); - affinity_restore(&oldmask); - return sprintf(buf, "%x\n", - (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); + return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b)); } static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask, newmask; - affinity_set(b->cpu, &oldmask, &newmask); - threshold_restart_bank(b, 1, 0); - affinity_restore(&oldmask); + struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + + work_on_cpu(b->cpu, threshold_restart_bank, &tr); return 1; } @@ -463,12 +462,19 @@ out_free: return err; } +static long local_allocate_threshold_blocks(void *_bank) +{ + unsigned int *bank = _bank; + + return allocate_threshold_blocks(smp_processor_id(), *bank, 0, + MSR_IA32_MC0_MISC + *bank * 4); +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - affinity_set(cpu, &oldmask, &newmask); - err = allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(&oldmask); - + err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank); if (err) goto out_free; diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index c17eaf5dd6d..4b48f251fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void) if (therm_throt_process(msr_val & 1)) mce_log_therm_throt_event(smp_processor_id(), msr_val); - add_pda(irq_thermal_count, 1); + inc_irq_stat(irq_thermal_count); irq_exit(); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4e8d77f01ee..b59ddcc88cd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -14,14 +14,6 @@ #include <asm/pat.h> #include "mtrr.h" -struct mtrr_state { - struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; - mtrr_type fixed_ranges[NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ @@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = { }; static unsigned long smp_changes_mask; -static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; u64 mtrr_tom2; +struct mtrr_state_type mtrr_state = {}; +EXPORT_SYMBOL_GPL(mtrr_state); + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c78c04821ea..d259e5d2e05 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -49,7 +49,7 @@ u32 num_var_ranges = 0; -unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -574,7 +574,7 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; +static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { @@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, } static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; #ifdef CONFIG_MTRR_SANITIZER @@ -823,16 +824,14 @@ static int enable_mtrr_cleanup __initdata = static int __init disable_mtrr_cleanup_setup(char *str) { - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 0; + enable_mtrr_cleanup = 0; return 0; } early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); static int __init enable_mtrr_cleanup_setup(char *str) { - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 1; + enable_mtrr_cleanup = 1; return 0; } early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); @@ -1206,39 +1205,43 @@ struct mtrr_cleanup_result { #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; -static struct res_range __initdata range_new[RANGE_NUM]; static unsigned long __initdata min_loss_pfn[RANGE_NUM]; -static int __init mtrr_cleanup(unsigned address_bits) +static void __init print_out_mtrr_range_state(void) { - unsigned long extra_remove_base, extra_remove_size; - unsigned long base, size, def, dummy; - mtrr_type type; - int nr_range, nr_range_new; - u64 chunk_size, gran_size; - unsigned long range_sums, range_sums_new; - int index_good; - int num_reg_good; int i; + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + mtrr_type type; - /* extra one for all 0 */ - int num[MTRR_NUM_TYPES + 1]; + for (i = 0; i < num_var_ranges; i++) { - if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) - return 0; - rdmsr(MTRRdefType_MSR, def, dummy); - def &= 0xff; - if (def != MTRR_TYPE_UNCACHABLE) - return 0; + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; - /* get it and store it aside */ - memset(range_state, 0, sizeof(range_state)); - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, &base, &size, &type); - range_state[i].base_pfn = base; - range_state[i].size_pfn = size; - range_state[i].type = type; + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; /* check entries number */ memset(num, 0, sizeof(num)); @@ -1263,29 +1266,133 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; - /* print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); - for (i = 0; i < num_var_ranges; i++) { - char start_factor = 'K', size_factor = 'K'; - unsigned long start_base, size_base; + return 1; +} - size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); - if (!size_base) - continue; +static unsigned long __initdata range_sums; +static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long extra_remove_base, + unsigned long extra_remove_size, + int i) +{ + int num_reg; + static struct res_range range_new[RANGE_NUM]; + static int nr_range_new; + unsigned long range_sums_new; + + /* convert ranges to var ranges state */ + num_reg = x86_setup_var_mtrrs(range, nr_range, + chunk_size, gran_size); + + /* we got new setting in range_state, check it */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + extra_remove_base, extra_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = + (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else + result[i].lose_cover_sizek = + (range_sums - range_sums_new) << PSHIFT; - size_base = to_size_factor(size_base, &size_factor), - start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); - start_base = to_size_factor(start_base, &start_factor), - type = range_state[i].type; + /* double check it */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || + memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", - i, start_base, start_factor, - size_base, size_factor, - (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRPROT) ? "WP" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) - ); + if (!result[i].bad && (range_sums - range_sums_new < + min_loss_pfn[num_reg])) { + min_loss_pfn[num_reg] = + range_sums - range_sums_new; } +} + +static void __init mtrr_print_out_one_result(int i) +{ + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int i; + int num_reg_good; + int index_good; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + + +static int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long extra_remove_base, extra_remove_size; + unsigned long base, size, def, dummy; + mtrr_type type; + u64 chunk_size, gran_size; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* get it and store it aside */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* check if we need handle it and can handle it */ + if (!mtrr_need_cleanup()) + return 0; + + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); extra_remove_size = 0; @@ -1309,176 +1416,64 @@ static int __init mtrr_cleanup(unsigned address_bits) range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { - int num_reg; - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - debug_print++; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, - mtrr_gran_size); + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + extra_remove_base, extra_remove_size, i); - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, - extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); + mtrr_print_out_one_result(i); - i = 0; - result[i].chunk_sizek = mtrr_chunk_size >> 10; - result[i].gran_sizek = mtrr_gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print--; - memset(result, 0, sizeof(result[0])); } i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { - char gran_factor; - unsigned long gran_base; - - if (debug_print) - gran_base = to_size_factor(gran_size >> 10, &gran_factor); for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { - int num_reg; - if (debug_print) { - char chunk_factor; - unsigned long chunk_base; - - chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), - printk(KERN_INFO "\n"); - printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", - gran_base, gran_factor, chunk_base, chunk_factor); - } if (i >= NUM_RESULT) continue; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); - - /* we got new setting in range_state, check it */ - memset(range_new, 0, sizeof(range_new)); - nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); - range_sums_new = sum_ranges(range_new, nr_range_new); - - result[i].chunk_sizek = chunk_size >> 10; - result[i].gran_sizek = gran_size >> 10; - result[i].num_reg = num_reg; - if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; - result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; - - /* double check it */ - if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + mtrr_calc_range_state(chunk_size, gran_size, + extra_remove_base, extra_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } i++; } } - /* print out all */ - for (i = 0; i < NUM_RESULT; i++) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad?"*BAD*":" ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad?"-":"", - lose_base, lose_factor); - } - /* try to find the optimal index */ - if (nr_mtrr_spare_reg >= num_var_ranges) - nr_mtrr_spare_reg = num_var_ranges - 1; - num_reg_good = -1; - for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) - num_reg_good = i; - } - - index_good = -1; - if (num_reg_good != -1) { - for (i = 0; i < NUM_RESULT; i++) { - if (!result[i].bad && - result[i].num_reg == num_reg_good && - !result[i].lose_cover_sizek) { - index_good = i; - break; - } - } - } + index_good = mtrr_search_optimal_index(); if (index_good != -1) { - char gran_factor, chunk_factor, lose_factor; - unsigned long gran_base, chunk_base, lose_base; - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", - result[i].num_reg, lose_base, lose_factor); + mtrr_print_out_one_result(i); + /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - debug_print--; set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); } printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); @@ -1562,7 +1557,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) { unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; - int nr_range; u64 total_trim_size; /* extra one for all 0 */ diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2dc4ec656b2..ffd60409cc6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -8,11 +8,6 @@ #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -#define NUM_FIXED_RANGES 88 -#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -29,11 +24,7 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef u8 mtrr_type; - -extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; @@ -70,13 +61,6 @@ struct set_mtrr_context { u32 ccr3; }; -struct mtrr_var_range { - u32 base_lo; - u32 base_hi; - u32 mask_lo; - u32 mask_hi; -}; - void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 00000000000..284c399e323 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,112 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria <akataria@vmware.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dmi.h> +#include <asm/div64.h> +#include <asm/vmware.h> + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 + +#define VMWARE_PORT_CMD_GETVERSION 10 +#define VMWARE_PORT_CMD_GETHZ 45 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx)" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "0"(VMWARE_HYPERVISOR_MAGIC), \ + "1"(VMWARE_PORT_CMD_##cmd), \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ + "memory"); + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long __vmware_get_tsc_khz(void) +{ + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx == UINT_MAX) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; +} + +/* + * While checking the dmi string infomation, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. + */ +int vmware_platform(void) +{ + if (cpu_has_hypervisor) { + unsigned int eax, ebx, ecx, edx; + char hyper_vendor_id[13]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); + memcpy(hyper_vendor_id + 0, &ebx, 4); + memcpy(hyper_vendor_id + 4, &ecx, 4); + memcpy(hyper_vendor_id + 8, &edx, 4); + hyper_vendor_id[12] = '\0'; + if (!strcmp(hyper_vendor_id, "VMwareVMware")) + return 1; + } else if (dmi_available && dmi_name_in_serial("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +unsigned long vmware_get_tsc_khz(void) +{ + BUG_ON(!vmware_platform()); + return __vmware_get_tsc_khz(); +} + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); +} diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 72cefd1e649..2ac1f0c2beb 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -39,10 +39,10 @@ #include <linux/device.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/system.h> static struct class *cpuid_class; @@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) } static ssize_t cpuid_read(struct file *file, char __user *buf, - size_t count, loff_t * ppos) + size_t count, loff_t *ppos) { char __user *tmp = buf; struct cpuid_regs cmd; @@ -117,11 +117,11 @@ static int cpuid_open(struct inode *inode, struct file *file) unsigned int cpu; struct cpuinfo_x86 *c; int ret = 0; - + lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= NR_CPUS || !cpu_online(cpu)) { + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { ret = -ENXIO; /* No such CPU */ goto out; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 26855381790..c689d19e35a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -26,37 +26,21 @@ #include <linux/kdebug.h> #include <asm/smp.h> #include <asm/reboot.h> +#include <asm/virtext.h> #include <mach_ipi.h> -/* This keeps a track of which one is crashing cpu. */ -static int crashing_cpu; #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) -static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct notifier_block *self, - unsigned long val, void *data) +static void kdump_nmi_callback(int cpu, struct die_args *args) { struct pt_regs *regs; #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; #endif - int cpu; - if (val != DIE_NMI_IPI) - return NOTIFY_OK; - - regs = ((struct die_args *)data)->regs; - cpu = raw_smp_processor_id(); - - /* Don't do anything if this handler is invoked on crashing cpu. - * Otherwise, system will completely hang. Crashing cpu can get - * an NMI if system was initially booted with nmi_watchdog parameter. - */ - if (cpu == crashing_cpu) - return NOTIFY_STOP; - local_irq_disable(); + regs = args->regs; #ifdef CONFIG_X86_32 if (!user_mode_vm(regs)) { @@ -65,54 +49,28 @@ static int crash_nmi_callback(struct notifier_block *self, } #endif crash_save_cpu(regs, cpu); - disable_local_APIC(); - atomic_dec(&waiting_for_crash_ipi); - /* Assume hlt works */ - halt(); - for (;;) - cpu_relax(); - return 1; -} + /* Disable VMX or SVM if needed. + * + * We need to disable virtualization on all CPUs. + * Having VMX or SVM enabled on any CPU may break rebooting + * after the kdump kernel has finished its task. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); + disable_local_APIC(); } -static struct notifier_block crash_nmi_nb = { - .notifier_call = crash_nmi_callback, -}; - -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { - unsigned long msecs; - - atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_die_notifier(&crash_nmi_nb)) - return; /* return what? */ - /* Ensure the new callback function is set before sending - * out the NMI - */ - wmb(); - - smp_send_nmi_allbutself(); - - msecs = 1000; /* Wait at most a second for the other cpus to stop */ - while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { - mdelay(1); - msecs--; - } + nmi_shootdown_cpus(kdump_nmi_callback); - /* Leave the nmi callback set */ disable_local_APIC(); } + #else -static void nmi_shootdown_cpus(void) +static void kdump_nmi_shootdown_cpus(void) { /* There are no cpus to shootdown */ } @@ -131,9 +89,15 @@ void native_machine_crash_shutdown(struct pt_regs *regs) /* The kernel is broken so disable interrupts */ local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ - crashing_cpu = safe_smp_processor_id(); - nmi_shootdown_cpus(); + kdump_nmi_shootdown_cpus(); + + /* Booting kdump kernel with VMX or SVM enabled won't work, + * because (among other limitations) we can't disable paging + * with the virt flags. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index a2d1176c38e..da91701a234 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,14 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - DS and BTS hardware configuration + * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -28,22 +27,69 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/kernel.h> /* * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. */ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + + +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, /* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. + * Locking is done only for allocating BTS or PEBS resources. */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); - -/* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. - */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} +static DEFINE_SPINLOCK(ds_lock); /* @@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context, * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. - */ -static DEFINE_PER_CPU(struct ds_context *, system_context); - -#define this_system_context per_cpu(system_context, smp_processor_id()) - -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. */ -static inline struct ds_context *ds_get_context(struct task_struct *task) -{ - struct ds_context *context; - unsigned long irq; +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; - spin_lock_irqsave(&ds_lock, irq); +static DEFINE_PER_CPU(struct ds_context *, system_context_array); - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; +#define system_context per_cpu(system_context_array, smp_processor_id()) - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) +static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); - struct ds_context *context = *p_context; + (task ? &task->thread.ds_ctx : &system_context); + struct ds_context *context = NULL; + struct ds_context *new_context = NULL; unsigned long irq; - if (!context) { - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return NULL; - - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + if (!new_context) + return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irqsave(&ds_lock, irq); - if (*p_context) { - kfree(context->ds); - kfree(context); + context = *p_context; + if (!context) { + context = new_context; - context = *p_context; - } else { - *p_context = context; + context->this = p_context; + context->task = task; + context->count = 0; - context->this = p_context; - context->task = task; + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + if (!task || (task == current)) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, - (unsigned long)context->ds); - } - spin_unlock_irqrestore(&ds_lock, irq); + *p_context = context; } context->count++; + spin_unlock_irqrestore(&ds_lock, irq); + + if (context != new_context) + kfree(new_context); + return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); + spin_unlock_irqrestore(&ds_lock, irq); - /* free any leftover buffers from tracers that did not - * deallocate them properly. */ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); - out: - spin_unlock_irqrestore(&ds_lock, irq); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ + switch (qual) { + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); + break; + } } /* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. + * Write raw data into the BTS or PEBS buffer. * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. + * The remainder of any partially written record is zeroed out. * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer; + int bytes_written = 0; - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + if (!record) + return -EINVAL; - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - return NULL; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - return NULL; + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - return NULL; + write_end = min(end, int_th); - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; - if (pages) - *pages = pgsz; + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); - return buffer; + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) { - struct ds_context *context; - unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); - unsigned long irq; - int error = 0; + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} - if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; - /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer. + * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) return -EINVAL; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (at < tracer->trace.ds.begin) + return -EINVAL; + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; - context = ds_alloc_context(task); - if (!context) - return -ENOMEM; + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + + if (!out->variant.lbr.from && !out->variant.lbr.to) + out->qualifier = bts_invalid; + } - spin_lock_irqsave(&ds_lock, irq); + return ds_cfg.sizeof_rec[ds_bts]; +} - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; - get_tracer(task); + if (!tracer) + return -EINVAL; - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; - error = -EPERM; - if (context->owner[qual] != NULL) - goto out_put_tracer; - context->owner[qual] = current; + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; - spin_unlock_irqrestore(&ds_lock, irq); + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); + break; + default: + return -EINVAL; + } + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); +} - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - context->buffer[qual] = base; - } - error = 0; +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; - context->callback[qual] = ovfl; + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { + unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment * constraints: @@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size, */ buffer = (unsigned long)base; - adj = ALIGN(buffer, alignment) - buffer; + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); + size = (trace->n * trace->size); - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; - - out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - put_tracer(task); - return error; + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. + */ + trace->ith = (void *)(buffer + size - ith); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - return error; + trace->flags = flags; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_bts); -} -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); -} - -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + error = -EINVAL; + if (!base) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; - - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; - - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); - out: - ds_put_context(context); - return error; -} + /* we require some space to do alignment adjustments below */ + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; -int ds_release_bts(struct task_struct *task) -{ - return ds_release(task, ds_bts); -} + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; -int ds_release_pebs(struct task_struct *task) -{ - return ds_release(task, ds_pebs); -} + error = -EINVAL; + if (size <= th) + goto out; + } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, index; - int error; + tracer->buffer = base; + tracer->size = size; + error = -ENOMEM; context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + if (!context) goto out; + tracer->context = context; - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); + ds_init_ds_trace(trace, qual, base, size, th, flags); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; + error = 0; out: - ds_put_context(context); return error; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) -{ - return ds_get_index(task, pos, ds_bts); -} - -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { - return ds_get_index(task, pos, ds_pebs); -} - -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, end; + struct bts_tracer *tracer; + unsigned long irq; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) goto out; - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; -} + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; -int ds_get_bts_end(struct task_struct *task, size_t *pos) -{ - return ds_get_end(task, pos, ds_bts); -} + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, base, size, th, flags); + if (error < 0) + goto out_tracer; -int ds_get_pebs_end(struct task_struct *task, size_t *pos) -{ - return ds_get_end(task, pos, ds_pebs); -} -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, idx; - int error; + spin_lock_irqsave(&ds_lock, irq); - if (!record) - return -EINVAL; + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); + spin_unlock_irqrestore(&ds_lock, irq); - error = -EINVAL; - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; - *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; -} + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; -int ds_access_bts(struct task_struct *task, size_t index, const void **record) -{ - return ds_access(task, index, record, ds_bts); -} + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) -{ - return ds_access(task, index, record, ds_pebs); + return tracer; + + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { - struct ds_context *context; + struct pebs_tracer *tracer; + unsigned long irq; int error; - if (!record) - return -EINVAL; + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; - error = -EPERM; - context = ds_get_context(task); - if (!context) + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) goto out; + tracer->ovfl = ovfl; - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); + if (error < 0) + goto out_tracer; - error = 0; - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; + spin_lock_irqsave(&ds_lock, irq); - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); - write_end = min(end, int_th); + error = -EPERM; + if (tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; + spin_unlock_irqrestore(&ds_lock, irq); - if (write_end <= index) - goto out; + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); + return tracer; - record = (const char *)record + write_size; - size -= write_size; - error += write_size; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); +} - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; +void ds_release_bts(struct bts_tracer *tracer) +{ + if (!tracer) + return; - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + ds_suspend_bts(tracer); - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; - if (index >= int_th) - ds_overflow(task, context, qual); - } + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - out: - ds_put_context(context); - return error; + kfree(tracer); } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +void ds_suspend_bts(struct bts_tracer *tracer) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + struct task_struct *task; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); -} + if (!tracer) + return; -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + task = tracer->ds.context->task; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; + + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +void ds_resume_bts(struct bts_tracer *tracer) { - struct ds_context *context; - unsigned long base, end; - int error; + struct task_struct *task; + unsigned long control; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + if (!tracer) + return; - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + task = tracer->ds.context->task; - if (clear) - memset((void *)base, 0, end - base); + control = ds_cfg.ctl[dsf_bts]; + if (!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; - ds_set(context->ds, qual, ds_index, base); + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - error = 0; - out: - ds_put_context(context); - return error; + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_reset_bts(struct task_struct *task) +void ds_release_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return; + + ds_suspend_pebs(tracer); + + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; + + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); + + kfree(tracer); } -int ds_reset_pebs(struct task_struct *task) +void ds_suspend_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + } -int ds_clear_bts(struct task_struct *task) +void ds_resume_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + } -int ds_clear_pebs(struct task_struct *task) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return NULL; + + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - int error; + if (!tracer) + return NULL; + + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - if (!value) + return &tracer->trace; +} + +int ds_reset_bts(struct bts_tracer *tracer) +{ + if (!tracer) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + tracer->trace.ds.top = tracer->trace.ds.begin; - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_reset_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + tracer->trace.ds.top = tracer->trace.ds.begin; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); - error = 0; - out: - ds_put_context(context); - return error; + return 0; +} + +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +{ + if (!tracer) + return -EINVAL; + + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + + return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; + + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; + printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); + + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -847,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ - ds_configure(&ds_cfg_64); - break; - default: - /* sorry, don't know about them */ + default: /* Core2, Atom, ... */ + ds_configure(&ds_cfg_core2); break; } break; @@ -865,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -878,12 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) +{ + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); + } + + update_debugctlmsr(next->thread.debugctlmsr); +} + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. */ - unsigned long leftovers = context->count; - while (leftovers--) - ds_put_context(context); + WARN_ON(tsk->thread.ds_ctx); } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 00000000000..6b1f6f6f866 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#include "dumpstack.h" + +int panic_on_unrecovered_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task = tinfo->task; + unsigned long ret_addr; + int index = task->curr_ret_stack; + + if (addr != (unsigned long)return_to_handler) + return; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + stack++; + } + return bp; +} + + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + get_bp(bp); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 + unsigned short ss; + unsigned long sp; +#endif + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + sysfs_printk_last_file(); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); +#ifdef CONFIG_X86_32 + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); +#endif + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (!user_mode_vm(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + flags = oops_begin(); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + oops_end(flags, regs, 0); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h new file mode 100644 index 00000000000..da87590b869 --- /dev/null +++ b/arch/x86/kernel/dumpstack.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + +#ifndef DUMPSTACK_H +#define DUMPSTACK_H + +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; +extern int kstack_depth_to_print; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; +#endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197..d593cd1f58d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,69 +17,14 @@ #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} +#include "dumpstack.h" void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + int graph = 0; + if (!task) task = current; @@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL); + bp = print_context_stack(context, stack, bp, ops, + data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { @@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} - -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - bust_spinlocks(0); - do_exit(SIGSEGV); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a..c302d070704 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,19 +17,7 @@ #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%p>] %s%pS\n", (void *) address, - reliable ? "" : "? ", (void *) address); -} +#include "dumpstack.h" static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) @@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; + int graph = 0; if (!task) task = current; @@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); + data, estack_end, &graph); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); + ops, data, irqstack_end, &graph); /* * We link to the next stack (which would be * the process stack normally) the last @@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk("%s <%s> ", (char *)data, name); - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk(data); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { @@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - void show_registers(struct pt_regs *regs) { int i; @@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (in_interrupt()) - panic("Fatal exception in interrupt"); - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - sysfs_printk_last_file(); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7aafeb5263e..65a13943e09 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -677,22 +677,6 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE) - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" }, -#endif -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" }, - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" }, -#endif {} }; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 1b894b72c0f..744aa7fc49d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@ #include <asm/io_apic.h> #include <asm/apic.h> #include <asm/iommu.h> +#include <asm/gart.h> static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 34ad997d383..504ad198e4a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -875,49 +875,6 @@ static struct console early_dbgp_console = { }; #endif -/* Console interface to a host file on AMD's SimNow! */ - -static int simnow_fd; - -enum { - MAGIC1 = 0xBACCD00A, - MAGIC2 = 0xCA110000, - XOPEN = 5, - XWRITE = 4, -}; - -static noinline long simnow(long cmd, long a, long b, long c) -{ - long ret; - - asm volatile("cpuid" : - "=a" (ret) : - "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); - return ret; -} - -static void __init simnow_init(char *str) -{ - char *fn = "klog"; - - if (*str == '=') - fn = ++str; - /* error ignored */ - simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644); -} - -static void simnow_write(struct console *con, const char *s, unsigned n) -{ - simnow(XWRITE, simnow_fd, (unsigned long)s, n); -} - -static struct console simnow_console = { - .name = "simnow", - .write = simnow_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; @@ -929,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...) va_list ap; va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); early_console->write(early_console, buf, n); va_end(ap); } @@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf) max_ypos = boot_params.screen_info.orig_video_lines; current_ypos = boot_params.screen_info.orig_y; early_console = &early_vga_console; - } else if (!strncmp(buf, "simnow", 6)) { - simnow_init(buf + 6); - early_console = &simnow_console; - keep_early = 1; #ifdef CONFIG_EARLY_PRINTK_DBGP } else if (!strncmp(buf, "dbgp", 4)) { if (early_dbgp_init(buf+4) < 0) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca..d6f0490a739 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -619,28 +619,37 @@ END(syscall_badsys) 27:; /* - * Build the entry stubs and pointer table with - * some assembler magic. + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. */ -.section .rodata,"a" +.section .init.rodata,"a" ENTRY(interrupt) .text - + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) RING0_INT_FRAME -vector=0 -.rept NR_VECTORS - ALIGN - .if vector +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl $~(vector) + .endif +1: pushl $(~vector+0x80) /* Note: always in signed byte range */ CFI_ADJUST_CFA_OFFSET 4 - jmp common_interrupt - .previous + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous .long 1b - .text + .text vector=vector+1 + .endif + .endr +2: jmp common_interrupt .endr END(irq_entries_start) @@ -652,8 +661,9 @@ END(interrupt) * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: */ - ALIGN + .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ SAVE_ALL TRACE_IRQS_OFF movl %esp,%eax @@ -678,65 +688,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 - ALIGN -error_code: - /* the function address is in %fs's slot on the stack */ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET ebx, 0 - cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs - UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - TRACE_IRQS_OFF - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(page_fault) - ENTRY(coprocessor_error) RING0_INT_FRAME pushl $0 @@ -767,140 +718,6 @@ ENTRY(device_not_available) CFI_ENDPROC END(device_not_available) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. - * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl TSS_sysenter_sp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $__KERNEL_CS; \ - CFI_ADJUST_CFA_OFFSET 4; \ - pushl $sysenter_past_esp; \ - CFI_ADJUST_CFA_OFFSET 4; \ - CFI_REL_OFFSET eip, 0 - -KPROBE_ENTRY(debug) - RING0_INT_FRAME - cmpl $ia32_sysenter_target,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(debug) - -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -KPROBE_ENTRY(nmi) - RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - je nmi_espfix_stack - cmpl $ia32_sysenter_target,(%esp) - je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - jae nmi_stack_correct - cmpl $ia32_sysenter_target,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - jmp restore_nocheck_notrace - CFI_ENDPROC - -nmi_stack_fixup: - RING0_INT_FRAME - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_debug_stack_check: - /* We have a RING0_INT_FRAME here */ - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug,(%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - ja nmi_stack_correct - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct - -nmi_espfix_stack: - /* We have a RING0_INT_FRAME here. - * - * create the pointer to lss back - */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 - .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to espfix stack - CFI_ADJUST_CFA_OFFSET -24 - jmp irq_return - CFI_ENDPROC -KPROBE_END(nmi) - #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret @@ -916,19 +733,6 @@ ENTRY(native_irq_enable_sysexit) END(native_irq_enable_sysexit) #endif -KPROBE_ENTRY(int3) - RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - TRACE_IRQS_OFF - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - jmp ret_from_exception - CFI_ENDPROC -KPROBE_END(int3) - ENTRY(overflow) RING0_INT_FRAME pushl $0 @@ -993,14 +797,6 @@ ENTRY(stack_segment) CFI_ENDPROC END(stack_segment) -KPROBE_ENTRY(general_protection) - RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 - jmp error_code - CFI_ENDPROC -KPROBE_END(general_protection) - ENTRY(alignment_check) RING0_EC_FRAME pushl $do_alignment_check @@ -1051,6 +847,7 @@ ENTRY(kernel_thread_helper) push %eax CFI_ADJUST_CFA_OFFSET 4 call do_exit + ud2 # padding for call trace CFI_ENDPROC ENDPROC(kernel_thread_helper) @@ -1157,6 +954,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1171,6 +971,11 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -1180,8 +985,18 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif .globl ftrace_stub ftrace_stub: ret @@ -1200,13 +1015,268 @@ trace: popl %edx popl %ecx popl %eax - jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif + .section .rodata,"a" #include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 + +ENTRY(debug) + RING0_INT_FRAME + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(general_protection) + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a..e28c7a98779 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -11,15 +11,15 @@ * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is + * + * Normal syscalls and interrupts don't save a full stack frame, this is * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. + * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better @@ -60,7 +60,6 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 - #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -68,16 +67,10 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + MCOUNT_SAVE_FRAME movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi @@ -87,14 +80,13 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + MCOUNT_RESTORE_FRAME + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif .globl ftrace_stub ftrace_stub: @@ -103,15 +95,63 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq trace: - /* taken from glibc */ - subq $0x38, %rsp + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + movq %rax, (%rsp) movq %rcx, 8(%rsp) movq %rdx, 16(%rsp) @@ -119,13 +159,14 @@ trace: movq %rdi, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi - - call *ftrace_trace_function + call ftrace_return_to_handler + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 movq 48(%rsp), %r9 movq 40(%rsp), %r8 movq 32(%rsp), %rdi @@ -133,16 +174,14 @@ trace: movq 16(%rsp), %rdx movq 8(%rsp), %rcx movq (%rsp), %rax - addq $0x38, %rsp + addq $72, %rsp + retq +#endif - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args -#endif +#endif #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) @@ -161,29 +200,29 @@ ENTRY(native_usergs_sysret64) .endm /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based * fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. - */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq %gs:pda_oldrsp,\tmp - movq \tmp,RSP(%rsp) - movq $__USER_DS,SS(%rsp) - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) - movq R11(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS(%rsp) + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq %gs:pda_oldrsp,\tmp + movq \tmp,RSP+\offset(%rsp) + movq $__USER_DS,SS+\offset(%rsp) + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + movq R11+\offset(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS+\offset(%rsp) .endm - .macro RESTORE_TOP_OF_STACK tmp,offset=0 - movq RSP-\offset(%rsp),\tmp - movq \tmp,%gs:pda_oldrsp - movq EFLAGS-\offset(%rsp),\tmp - movq \tmp,R11-\offset(%rsp) + .macro RESTORE_TOP_OF_STACK tmp offset=0 + movq RSP+\offset(%rsp),\tmp + movq \tmp,%gs:pda_oldrsp + movq EFLAGS+\offset(%rsp),\tmp + movq \tmp,R11+\offset(%rsp) .endm .macro FAKE_STACK_FRAME child_rip @@ -195,7 +234,7 @@ ENTRY(native_usergs_sysret64) pushq %rax /* rsp */ CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rsp,0 - pushq $(1<<9) /* eflags - interrupts on */ + pushq $X86_EFLAGS_IF /* eflags - interrupts on */ CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ pushq $__KERNEL_CS /* cs */ @@ -213,62 +252,184 @@ ENTRY(native_usergs_sysret64) CFI_ADJUST_CFA_OFFSET -(6*8) .endm - .macro CFI_DEFAULT_STACK start=1 +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME start=1 offset=0 .if \start - CFI_STARTPROC simple + CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 + CFI_DEF_CFA rsp,8+\offset .else - CFI_DEF_CFA_OFFSET SS+8 + CFI_DEF_CFA_OFFSET 8+\offset .endif - CFI_REL_OFFSET r15,R15 - CFI_REL_OFFSET r14,R14 - CFI_REL_OFFSET r13,R13 - CFI_REL_OFFSET r12,R12 - CFI_REL_OFFSET rbp,RBP - CFI_REL_OFFSET rbx,RBX - CFI_REL_OFFSET r11,R11 - CFI_REL_OFFSET r10,R10 - CFI_REL_OFFSET r9,R9 - CFI_REL_OFFSET r8,R8 - CFI_REL_OFFSET rax,RAX - CFI_REL_OFFSET rcx,RCX - CFI_REL_OFFSET rdx,RDX - CFI_REL_OFFSET rsi,RSI - CFI_REL_OFFSET rdi,RDI - CFI_REL_OFFSET rip,RIP - /*CFI_REL_OFFSET cs,CS*/ - /*CFI_REL_OFFSET rflags,EFLAGS*/ - CFI_REL_OFFSET rsp,RSP - /*CFI_REL_OFFSET ss,SS*/ .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + EMPTY_FRAME \start, SS+8+\offset-RIP + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ + .endm + /* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + PARTIAL_FRAME \start, R11+\offset-R15 + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm + +/* save partial stack frame */ +ENTRY(save_args) + XCPT_FRAME + cld + movq_cfi rdi, RDI+16-ARGOFFSET + movq_cfi rsi, RSI+16-ARGOFFSET + movq_cfi rdx, RDX+16-ARGOFFSET + movq_cfi rcx, RCX+16-ARGOFFSET + movq_cfi rax, RAX+16-ARGOFFSET + movq_cfi r8, R8+16-ARGOFFSET + movq_cfi r9, R9+16-ARGOFFSET + movq_cfi r10, R10+16-ARGOFFSET + movq_cfi r11, R11+16-ARGOFFSET + + leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ + movq_cfi rbp, 8 /* push %rbp */ + leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irqcount is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl %gs:pda_irqcount + jne 2f + popq_cfi %rax /* move return address... */ + mov %gs:pda_irqstackptr,%rsp + EMPTY_FRAME 0 + pushq_cfi %rax /* ... to the new stack */ + /* + * We entered an interrupt context - irqs are off: + */ +2: TRACE_IRQS_OFF + ret + CFI_ENDPROC +END(save_args) + +ENTRY(save_rest) + PARTIAL_FRAME 1 REST_SKIP+8 + movq 5*8+16(%rsp), %r11 /* save return address */ + movq_cfi rbx, RBX+16 + movq_cfi rbp, RBP+16 + movq_cfi r12, R12+16 + movq_cfi r13, R13+16 + movq_cfi r14, R14+16 + movq_cfi r15, R15+16 + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + +/* save complete stack frame */ +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ ENTRY(ret_from_fork) - CFI_DEFAULT_STACK + DEFAULT_FRAME + push kernel_eflags(%rip) CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags + popf # reset kernel eflags CFI_ADJUST_CFA_OFFSET -8 - call schedule_tail + + call schedule_tail # rdi: 'prev' task parameter + GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) - jnz rff_trace -rff_action: + + CFI_REMEMBER_STATE RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? je int_ret_from_sys_call - testl $_TIF_IA32,TI_flags(%rcx) + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + + CFI_RESTORE_STATE CFI_ENDPROC END(ret_from_fork) @@ -278,20 +439,20 @@ END(ret_from_fork) * SYSCALL does not save anything on the stack and does not change the * stack pointer. */ - + /* - * Register setup: + * Register setup: * rax system call number * rdi arg0 - * rcx return address for syscall/sysret, C arg3 + * rcx return address for syscall/sysret, C arg3 * rsi arg1 - * rdx arg2 + * rdx arg2 * r10 arg3 (--> moved to rcx for C) * r8 arg4 * r9 arg5 * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * + * r12-r15,rbp,rbx saved by C code, not touched. + * * Interrupts are off on entry. * Only called from user space. * @@ -301,7 +462,7 @@ END(ret_from_fork) * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. - */ + */ ENTRY(system_call) CFI_STARTPROC simple @@ -317,7 +478,7 @@ ENTRY(system_call) */ ENTRY(system_call_after_swapgs) - movq %rsp,%gs:pda_oldrsp + movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp /* * No need to follow this irqs off/on section - it's straight @@ -325,7 +486,7 @@ ENTRY(system_call_after_swapgs) */ ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) @@ -339,19 +500,19 @@ system_call_fastpath: movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ + * Has incomplete stack frame and undefined top of stack. + */ ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ -sysret_check: +sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl TI_flags(%rcx),%edx andl %edi,%edx - jnz sysret_careful + jnz sysret_careful CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: @@ -366,7 +527,7 @@ sysret_check: CFI_RESTORE_STATE /* Handle reschedules */ - /* edx: work, edi: workmask */ + /* edx: work, edi: workmask */ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal @@ -379,7 +540,7 @@ sysret_careful: CFI_ADJUST_CFA_OFFSET -8 jmp sysret_check - /* Handle a signal */ + /* Handle a signal */ sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -388,17 +549,20 @@ sysret_signal: jc sysret_audit #endif /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 - call ptregscall_common + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call do_notify_resume + RESTORE_TOP_OF_STACK %r11 + RESTORE_REST movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check - + badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call @@ -437,7 +601,7 @@ sysret_audit: #endif /* CONFIG_AUDITSYSCALL */ /* Do syscall tracing */ -tracesys: +tracesys: #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) jz auditsys @@ -460,8 +624,8 @@ tracesys: call *sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - -/* + +/* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ @@ -505,18 +669,18 @@ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - /* Check for syscall exit trace */ + /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 - leaq 8(%rsp),%rdi # &ptregs -> arg1 + leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest - + int_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -531,22 +695,24 @@ int_restore_rest: jmp int_with_check CFI_ENDPROC END(system_call) - -/* + +/* * Certain special system calls that need to save a complete full stack frame. - */ - + */ .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME 0 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC END(\label) .endm - CFI_STARTPROC - PTREGSCALL stub_clone, sys_clone, %r8 PTREGSCALL stub_fork, sys_fork, %rdi PTREGSCALL stub_vfork, sys_vfork, %rdi @@ -554,25 +720,18 @@ END(\label) PTREGSCALL stub_iopl, sys_iopl, %rsi ENTRY(ptregscall_common) - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 - SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ CFI_ENDPROC END(ptregscall_common) - + ENTRY(stub_execve) CFI_STARTPROC popq %r11 @@ -588,11 +747,11 @@ ENTRY(stub_execve) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. - */ + */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp @@ -608,70 +767,70 @@ ENTRY(stub_rt_sigreturn) END(stub_rt_sigreturn) /* - * initial frame state for interrupts and exceptions + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. */ - .macro _frame ref - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-\ref - /*CFI_REL_OFFSET ss,SS-\ref*/ - CFI_REL_OFFSET rsp,RSP-\ref - /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ - /*CFI_REL_OFFSET cs,CS-\ref*/ - CFI_REL_OFFSET rip,RIP-\ref - .endm + .section .init.rodata,"a" +ENTRY(interrupt) + .text + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + INTR_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -8 + .endif +1: pushq $(~vector+0x80) /* Note: always in signed byte range */ + CFI_ADJUST_CFA_OFFSET 8 + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .quad 1b + .text +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr + CFI_ENDPROC +END(irq_entries_start) -/* initial frame state for interrupts (and exceptions without error code) */ -#define INTR_FRAME _frame RIP -/* initial frame state for exceptions with error code (and interrupts with - vector already pushed) */ -#define XCPT_FRAME _frame ORIG_RAX +.previous +END(interrupt) +.previous -/* +/* * Interrupt entry/exit. * * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ + * + * Entry runs with interrupts off. + */ -/* 0(%rsp): interrupt number */ +/* 0(%rsp): ~(interrupt number) */ .macro interrupt func - cld - SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler - pushq %rbp - /* - * Save rbp twice: One is for marking the stack frame, as usual, and the - * other, to fill pt_regs properly. This is because bx comes right - * before the last saved register in that structure, and not bp. If the - * base pointer were in the place bx is today, this would not be needed. - */ - movq %rbp, -8(%rsp) - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rbp, 0 - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp - testl $3,CS(%rdi) - je 1f - SWAPGS - /* irqcount is used to check if a CPU is already on an interrupt - stack or not. While this is essentially redundant with preempt_count - it is a little cheaper to use a separate counter in the PDA - (short of moving irq_enter into assembly, which would be too - much work) */ -1: incl %gs:pda_irqcount - cmoveq %gs:pda_irqstackptr,%rsp - push %rbp # backlink for old unwinder - /* - * We entered an interrupt context - irqs are off: - */ - TRACE_IRQS_OFF + subq $10*8, %rsp + CFI_ADJUST_CFA_OFFSET 10*8 + call save_args + PARTIAL_FRAME 0 call \func .endm -ENTRY(common_interrupt) + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: XCPT_FRAME + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: @@ -685,12 +844,12 @@ exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) je retint_kernel - + /* Interrupt came from user space */ /* * Has a correct top of stack, but a partial stack frame * %rcx: thread info. Interrupts off. - */ + */ retint_with_reschedule: movl $_TIF_WORK_MASK,%edi retint_check: @@ -763,20 +922,20 @@ retint_careful: pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule - popq %rdi + popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check - + retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST - movq $-1,ORIG_RAX(%rsp) + movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume @@ -798,324 +957,211 @@ ENTRY(retint_kernel) jnc retint_restore_args call preempt_schedule_irq jmp exit_intr -#endif +#endif CFI_ENDPROC END(common_interrupt) - + /* * APIC interrupts. - */ - .macro apicinterrupt num,func + */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) INTR_FRAME pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 - interrupt \func + interrupt \do_sym jmp ret_from_intr CFI_ENDPROC - .endm - -ENTRY(thermal_interrupt) - apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt -END(thermal_interrupt) - -ENTRY(threshold_interrupt) - apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt -END(threshold_interrupt) - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt -END(reschedule_interrupt) - - .macro INVALIDATE_ENTRY num -ENTRY(invalidate_interrupt\num) - apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt -END(invalidate_interrupt\num) - .endm +END(\sym) +.endm - INVALIDATE_ENTRY 0 - INVALIDATE_ENTRY 1 - INVALIDATE_ENTRY 2 - INVALIDATE_ENTRY 3 - INVALIDATE_ENTRY 4 - INVALIDATE_ENTRY 5 - INVALIDATE_ENTRY 6 - INVALIDATE_ENTRY 7 - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -END(call_function_interrupt) -ENTRY(call_function_single_interrupt) - apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt -END(call_function_single_interrupt) -ENTRY(irq_move_cleanup_interrupt) - apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt -END(irq_move_cleanup_interrupt) +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt #endif -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt -END(apic_timer_interrupt) +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt + +#ifdef CONFIG_SMP +apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ + invalidate_interrupt0 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ + invalidate_interrupt1 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ + invalidate_interrupt2 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ + invalidate_interrupt3 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ + invalidate_interrupt4 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ + invalidate_interrupt5 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ + invalidate_interrupt6 smp_invalidate_interrupt +apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ + invalidate_interrupt7 smp_invalidate_interrupt +#endif -ENTRY(uv_bau_message_intr1) - apicinterrupt 220,uv_bau_message_interrupt -END(uv_bau_message_intr1) +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt mce_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt -END(error_interrupt) +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -END(spurious_interrupt) - /* * Exception entry points. - */ - .macro zeroentry sym + */ +.macro zeroentry sym do_sym +ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 /* push error code/oldrax */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* push real oldrax to the rdi slot */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm - .macro errorentry sym - XCPT_FRAME +.macro paranoidzeroentry sym do_sym +ENTRY(\sym) + INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq %rax + pushq $-1 /* ORIG_RAX: no syscall to restart */ CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rax,0 - leaq \sym(%rip),%rax - jmp error_entry + subq $15*8, %rsp + call save_paranoid + TRACE_IRQS_OFF + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm - /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0, irqtrace=1 - SAVE_ALL - cld - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f - SWAPGS - xorl %ebx,%ebx -1: - .if \ist - movq %gs:pda_data_offset, %rbp - .endif - .if \irqtrace - TRACE_IRQS_OFF - .endif - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - .if \ist - subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - call \sym - .if \ist - addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) - .endif - DISABLE_INTERRUPTS(CLBR_NONE) - .if \irqtrace +.macro paranoidzeroentry_ist sym do_sym ist +ENTRY(\sym) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq $-1 /* ORIG_RAX: no syscall to restart */ + CFI_ADJUST_CFA_OFFSET 8 + subq $15*8, %rsp + call save_paranoid TRACE_IRQS_OFF - .endif - .endm + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + movq %gs:pda_data_offset, %rbp + subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + call \do_sym + addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + jmp paranoid_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC +END(\sym) +.endm - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - * - * "trace" is 0 for the NMI handler only, because irq-tracing - * is fundamentally NMI-unsafe. (we cannot change the soft and - * hard flags at once, atomically) - */ - .macro paranoidexit trace=1 - /* ebx: no swapgs flag */ -paranoid_exit\trace: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore\trace - testl $3,CS(%rsp) - jnz paranoid_userspace\trace -paranoid_swapgs\trace: - .if \trace - TRACE_IRQS_IRETQ 0 - .endif - SWAPGS_UNSAFE_STACK -paranoid_restore\trace: - RESTORE_ALL 8 - jmp irq_return -paranoid_userspace\trace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs\trace - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule\trace - movl %ebx,%edx /* arg3: thread flags */ - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace -paranoid_schedule\trace: - .if \trace - TRACE_IRQS_ON - .endif - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - .if \trace - TRACE_IRQS_OFF - .endif - jmp paranoid_userspace\trace +.macro errorentry sym do_sym +ENTRY(\sym) + XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call error_entry + DEFAULT_FRAME 0 + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ CFI_ENDPROC - .endm +END(\sym) +.endm -/* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -KPROBE_ENTRY(error_entry) - _frame RDI - CFI_REL_OFFSET rax,0 - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - CFI_REGISTER rax,rsi - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 - xorl %ebx,%ebx - testl $3,CS(%rsp) - je error_kernelspace -error_swapgs: - SWAPGS -error_sti: - TRACE_IRQS_OFF - movq %rdi,RDI(%rsp) - CFI_REL_OFFSET rdi,RDI - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi /* get error code */ - movq $-1,ORIG_RAX(%rsp) - call *%rax - /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -error_exit: - movl %ebx,%eax - RESTORE_REST - DISABLE_INTERRUPTS(CLBR_NONE) + /* error code is on the stack already */ +.macro paranoiderrorentry sym do_sym +ENTRY(\sym) + XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + subq $15*8,%rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - testl %eax,%eax - jne retint_kernel - LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_swapgs + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC +END(\sym) +.endm -error_kernelspace: - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq irq_return(%rip),%rcx - cmpq %rcx,RIP(%rsp) - je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -KPROBE_END(error_entry) - - /* Reload gs selector with exception handling */ - /* edi: new selector */ +zeroentry divide_error do_divide_error +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +paranoiderrorentry double_fault do_double_fault +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +zeroentry spurious_interrupt_bug do_spurious_interrupt_bug +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error + + /* Reload gs selector with exception handling */ + /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) - SWAPGS -gs_change: - movl %edi,%gs + SWAPGS +gs_change: + movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf + popf CFI_ADJUST_CFA_OFFSET -8 - ret + ret CFI_ENDPROC -ENDPROC(native_load_gs_index) - - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous - .section .fixup,"ax" +END(native_load_gs_index) + + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" /* running with kernelgs */ -bad_gs: +bad_gs: SWAPGS /* switch back to user gs */ xorl %eax,%eax - movl %eax,%gs - jmp 2b - .previous - + movl %eax,%gs + jmp 2b + .previous + /* * Create a kernel thread. * @@ -1138,7 +1184,7 @@ ENTRY(kernel_thread) xorl %r8d,%r8d xorl %r9d,%r9d - + # clone now call do_fork movq %rax,RAX(%rsp) @@ -1149,15 +1195,15 @@ ENTRY(kernel_thread) * so internally to the x86_64 port you can rely on kernel_thread() * not to reschedule the child before returning, this avoids the need * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] + * [Hopefully no generic code relies on the reschedule -AK] */ RESTORE_ALL UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_thread) - -child_rip: +END(kernel_thread) + +ENTRY(child_rip) pushq $0 # fake return address CFI_STARTPROC /* @@ -1170,8 +1216,9 @@ child_rip: # exit mov %eax, %edi call do_exit + ud2 # padding for call trace CFI_ENDPROC -ENDPROC(child_rip) +END(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. @@ -1191,10 +1238,10 @@ ENDPROC(child_rip) ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 - SAVE_ALL + SAVE_ALL movq %rsp,%rcx call sys_execve - movq %rax, RAX(%rsp) + movq %rax, RAX(%rsp) RESTORE_REST testq %rax,%rax je int_ret_from_sys_call @@ -1202,129 +1249,7 @@ ENTRY(kernel_execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(kernel_execve) - -KPROBE_ENTRY(page_fault) - errorentry do_page_fault -KPROBE_END(page_fault) - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error -END(coprocessor_error) - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error -END(simd_coprocessor_error) - -ENTRY(device_not_available) - zeroentry do_device_not_available -END(device_not_available) - - /* runs on exception stack */ -KPROBE_ENTRY(debug) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug, DEBUG_STACK - paranoidexit -KPROBE_END(debug) - - /* runs on exception stack */ -KPROBE_ENTRY(nmi) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi, 0, 0 -#ifdef CONFIG_TRACE_IRQFLAGS - paranoidexit 0 -#else - jmp paranoid_exit1 - CFI_ENDPROC -#endif -KPROBE_END(nmi) - -KPROBE_ENTRY(int3) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit1 - CFI_ENDPROC -KPROBE_END(int3) - -ENTRY(overflow) - zeroentry do_overflow -END(overflow) - -ENTRY(bounds) - zeroentry do_bounds -END(bounds) - -ENTRY(invalid_op) - zeroentry do_invalid_op -END(invalid_op) - -ENTRY(coprocessor_segment_overrun) - zeroentry do_coprocessor_segment_overrun -END(coprocessor_segment_overrun) - - /* runs on exception stack */ -ENTRY(double_fault) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_double_fault - jmp paranoid_exit1 - CFI_ENDPROC -END(double_fault) - -ENTRY(invalid_TSS) - errorentry do_invalid_TSS -END(invalid_TSS) - -ENTRY(segment_not_present) - errorentry do_segment_not_present -END(segment_not_present) - - /* runs on exception stack */ -ENTRY(stack_segment) - XCPT_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - paranoidentry do_stack_segment - jmp paranoid_exit1 - CFI_ENDPROC -END(stack_segment) - -KPROBE_ENTRY(general_protection) - errorentry do_general_protection -KPROBE_END(general_protection) - -ENTRY(alignment_check) - errorentry do_alignment_check -END(alignment_check) - -ENTRY(divide_error) - zeroentry do_divide_error -END(divide_error) - -ENTRY(spurious_interrupt_bug) - zeroentry do_spurious_interrupt_bug -END(spurious_interrupt_bug) - -#ifdef CONFIG_X86_MCE - /* runs on exception stack */ -ENTRY(machine_check) - INTR_FRAME - PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_machine_check - jmp paranoid_exit1 - CFI_ENDPROC -END(machine_check) -#endif +END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) @@ -1344,40 +1269,33 @@ ENTRY(call_softirq) decl %gs:pda_irqcount ret CFI_ENDPROC -ENDPROC(call_softirq) - -KPROBE_ENTRY(ignore_sysret) - CFI_STARTPROC - mov $-ENOSYS,%eax - sysret - CFI_ENDPROC -ENDPROC(ignore_sysret) +END(call_softirq) #ifdef CONFIG_XEN -ENTRY(xen_hypervisor_callback) - zeroentry xen_do_hypervisor_callback -END(xen_hypervisor_callback) +zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* -# A note on the "critical region" in our callback handler. -# We want to avoid stacking callback handlers due to events occurring -# during handling of the last event. To do this, we keep events disabled -# until we've done all processing. HOWEVER, we must enable events before -# popping the stack frame (can't be done atomically) and so it would still -# be possible to get enough handler activations to overflow the stack. -# Although unlikely, bugs of that kind are hard to track down, so we'd -# like to avoid the possibility. -# So, on entry to the handler we detect whether we interrupted an -# existing activation in its critical region -- if so, we pop the current -# activation and restart the handler using the previous one. -*/ + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. + */ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) CFI_STARTPROC -/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will - see the correct pointer to the pt_regs */ +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ movq %rdi, %rsp # we don't return, adjust the stack frame CFI_ENDPROC - CFI_DEFAULT_STACK + DEFAULT_FRAME 11: incl %gs:pda_irqcount movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1392,23 +1310,26 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) END(do_hypervisor_callback) /* -# Hypervisor uses this for application faults while it executes. -# We get here for two reasons: -# 1. Fault while reloading DS, ES, FS or GS -# 2. Fault while executing IRET -# Category 1 we do not need to fix up as Xen has already reloaded all segment -# registers that could be reloaded and zeroed the others. -# Category 2 we fix up by killing the current process. We cannot use the -# normal Linux return path in this case because if we use the IRET hypercall -# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -# We distinguish between categories by comparing each saved segment register -# with its current contents: any discrepancy means we in category 1. -*/ + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ ENTRY(xen_failsafe_callback) - framesz = (RIP-0x30) /* workaround buggy gas */ - _frame framesz - CFI_REL_OFFSET rcx, 0 - CFI_REL_OFFSET r11, 8 + INTR_FRAME 1 (6*8) + /*CFI_REL_OFFSET gs,GS*/ + /*CFI_REL_OFFSET fs,FS*/ + /*CFI_REL_OFFSET es,ES*/ + /*CFI_REL_OFFSET ds,DS*/ + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 movw %ds,%cx cmpw %cx,0x10(%rsp) CFI_REMEMBER_STATE @@ -1429,12 +1350,9 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - pushq %rcx - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $0 /* RIP */ + pushq_cfi %r11 + pushq_cfi %rcx jmp general_protection CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ @@ -1444,11 +1362,223 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $0 SAVE_ALL jmp error_exit CFI_ENDPROC END(xen_failsafe_callback) #endif /* CONFIG_XEN */ + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check do_machine_check +#endif + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + INTR_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK +paranoid_restore: + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. + */ +ENTRY(error_entry) + XCPT_FRAME + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: + TRACE_IRQS_OFF + ret + CFI_ENDPROC + +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%ecx /* zero extend */ + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti +END(error_entry) + + +/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +ENTRY(error_exit) + DEFAULT_FRAME + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_swapgs + CFI_ENDPROC +END(error_exit) + + + /* runs on exception stack */ +ENTRY(nmi) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + pushq_cfi $-1 + subq $15*8, %rsp + CFI_ADJUST_CFA_OFFSET 15*8 + call save_paranoid + DEFAULT_FRAME 0 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi +#ifdef CONFIG_TRACE_IRQFLAGS + /* paranoidexit; without TRACE_IRQS_OFF */ + /* ebx: no swapgs flag */ + DISABLE_INTERRUPTS(CLBR_NONE) + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore + testl $3,CS(%rsp) + jnz nmi_userspace +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + RESTORE_ALL 8 + jmp irq_return +nmi_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz nmi_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz nmi_schedule + movl %ebx,%edx /* arg3: thread flags */ + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + jmp nmi_userspace +nmi_schedule: + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + jmp nmi_userspace + CFI_ENDPROC +#else + jmp paranoid_exit + CFI_ENDPROC +#endif +END(nmi) + +ENTRY(ignore_sysret) + CFI_STARTPROC + mov $-ENOSYS,%eax + sysret + CFI_ENDPROC +END(ignore_sysret) + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 0aa2c443d60..53699c931ad 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,8 +38,11 @@ #include <asm/io.h> #include <asm/nmi.h> #include <asm/smp.h> +#include <asm/atomic.h> #include <asm/apicdef.h> #include <mach_mpparse.h> +#include <asm/genapic.h> +#include <asm/setup.h> /* * ES7000 chipsets @@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; +} + +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + +static int __init es7000_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + + /* MPENTIUMIII */ + if (boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { + es7000_update_genapic_to_cluster(); + genapic->wait_for_init_deassert = noop_wait_for_deassert; + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + } + + return 0; +} + void __init setup_unisys(void) { @@ -176,6 +216,8 @@ setup_unisys(void) else es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; + + x86_quirks->update_genapic = es7000_update_genapic; } /* @@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg) return status; } -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - void __init es7000_sw_apic(void) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac8c9b..1b43086b097 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -14,14 +14,17 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/percpu.h> +#include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> #include <asm/ftrace.h> +#include <linux/ftrace.h> #include <asm/nops.h> +#include <asm/nmi.h> -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,18 +34,12 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -int +/* + * Modifying code must take extra care. On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUS from exectuing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. + * + * 1) Put the instruction pointer into the IP buffer + * and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status; /* holds return value of text write */ +static int mod_code_write; /* set when NMI should do the write */ +static void *mod_code_ip; /* holds the IP to write to */ +static void *mod_code_newcode; /* holds the text to write to the IP */ + +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ + int r; + + r = snprintf(buf, size, "%u %u", + nmi_wait_count, + atomic_read(&nmi_update_count)); + return r; +} + +static void ftrace_mod_code(void) +{ + /* + * Yes, more than one CPU process can be writing to mod_code_status. + * (and the code itself) + * But if one were to fail, then they all should, and if one were + * to succeed, then they all should. + */ + mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, + MCOUNT_INSN_SIZE); +} + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); + /* Must have in_nmi seen before reading write flag */ + smp_mb(); + if (mod_code_write) { + ftrace_mod_code(); + atomic_inc(&nmi_update_count); + } +} + +void ftrace_nmi_exit(void) +{ + /* Finish all executions before clearing in_nmi */ + smp_wmb(); + atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ + int waited = 0; + + while (atomic_read(&in_nmi)) { + waited = 1; + cpu_relax(); + } + + if (waited) + nmi_wait_count++; +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ + mod_code_ip = (void *)ip; + mod_code_newcode = new_code; + + /* The buffers need to be visible before we let NMIs write them */ + smp_wmb(); + + mod_code_write = 1; + + /* Make sure write bit is visible before we wait on NMIs */ + smp_mb(); + + wait_for_nmi(); + + /* Make sure all running NMIs have finished before we write the code */ + smp_mb(); + + ftrace_mod_code(); + + /* Make sure the write happens before clearing the bit */ + smp_wmb(); + + mod_code_write = 0; + + /* make sure NMIs see the cleared bit */ + smp_mb(); + + wait_for_nmi(); + + return mod_code_status; +} + + + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + +static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return -EINVAL; /* replace the text with the new text */ - if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + if (do_ftrace_mod_code(ip, new_code)) return -EPERM; sync_core(); @@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return 0; } +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + return ftrace_modify_code(rec->ip, old, new); +} + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, + int old_offset, int new_offset) +{ + unsigned char code[MCOUNT_INSN_SIZE]; + + if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) + return -EINVAL; + + *(int *)(&code[1]) = new_offset; + + if (do_ftrace_mod_code(ip, &code)) + return -EPERM; + + return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + int old_offset, new_offset; + + old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + + return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func, int *depth) +{ + int index; + + if (!current->ret_stack) + return -EBUSY; + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = time; + *depth = index; + + return 0; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = index; + barrier(); + current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + pop_return_trace(&trace, &ret); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_graph_return(&trace); + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + struct ftrace_graph_ent trace; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (unlikely(atomic_read(&in_nmi))) + return; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: " _ASM_MOV " (%[parent_old]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(2b, 3b) + + : [parent_replaced] "=r" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); + return; + } + + if (unlikely(!__kernel_text_address(old))) { + ftrace_graph_stop(); + *parent = old; + WARN_ON(1); + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, + self_addr, &trace.depth) == -EBUSY) { + *parent = old; + return; + } + + trace.func = self_addr; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + current->curr_ret_stack--; + *parent = old; + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6c9bfc9e1e9..2bced78b0b8 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -21,6 +21,7 @@ #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> +#include <asm/setup.h> extern struct genapic apic_flat; extern struct genapic apic_physflat; @@ -53,6 +54,9 @@ void __init setup_apic_routing(void) genapic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index c0262791bda..34185488e4f 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static cpumask_t flat_target_cpus(void) +static const struct cpumask *flat_target_cpus(void) { - return cpu_online_map; + return cpu_online_mask; } -static cpumask_t flat_vector_allocation_domain(int cpu) +static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } /* @@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static void flat_send_IPI_mask(cpumask_t cpumask, int vector) +static inline void _flat_send_IPI_mask(unsigned long mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; unsigned long flags; local_irq_save(flags); @@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) local_irq_restore(flags); } +static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + + _flat_send_IPI_mask(mask, vector); +} + +static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + int cpu = smp_processor_id(); + + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); + _flat_send_IPI_mask(mask, vector); +} + static void flat_send_IPI_allbutself(int vector) { + int cpu = smp_processor_id(); #ifdef CONFIG_HOTPLUG_CPU int hotplug = 1; #else int hotplug = 0; #endif if (hotplug || vector == NMI_VECTOR) { - cpumask_t allbutme = cpu_online_map; + if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { + unsigned long mask = cpumask_bits(cpu_online_mask)[0]; - cpu_clear(smp_processor_id(), allbutme); + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); + _flat_send_IPI_mask(mask, vector); + } } else if (num_online_cpus() > 1) { __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); } @@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector) static void flat_send_IPI_all(int vector) { if (vector == NMI_VECTOR) - flat_send_IPI_mask(cpu_online_map, vector); + flat_send_IPI_mask(cpu_online_mask, vector); else __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } @@ -135,9 +155,18 @@ static int flat_apic_id_registered(void) return physid_isset(read_xapic_id(), phys_cpu_present_map); } -static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) { - return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; + unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; + unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS; + + return mask1 & mask2; } static unsigned int phys_pkg_id(int index_msb) @@ -157,8 +186,10 @@ struct genapic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_allbutself = flat_send_IPI_allbutself, .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static cpumask_t physflat_target_cpus(void) +static const struct cpumask *physflat_target_cpus(void) { - return cpu_online_map; + return cpu_online_mask; } -static cpumask_t physflat_vector_allocation_domain(int cpu) +static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) { - return cpumask_of_cpu(cpu); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } -static void physflat_send_IPI_mask(cpumask_t cpumask, int vector) +static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { send_IPI_mask_sequence(cpumask, vector); } -static void physflat_send_IPI_allbutself(int vector) +static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) { - cpumask_t allbutme = cpu_online_map; + send_IPI_mask_allbutself(cpumask, vector); +} - cpu_clear(smp_processor_id(), allbutme); - physflat_send_IPI_mask(allbutme, vector); +static void physflat_send_IPI_allbutself(int vector) +{ + send_IPI_mask_allbutself(cpu_online_mask, vector); } static void physflat_send_IPI_all(int vector) { - physflat_send_IPI_mask(cpu_online_map, vector); + physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -224,13 +259,31 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int +physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + struct genapic apic_physflat = { .name = "physical flat", .acpi_madt_oem_check = physflat_acpi_madt_oem_check, @@ -243,8 +296,10 @@ struct genapic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index f6a2c8eb48a..6ce497cc372 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t x2apic_target_cpus(void) +static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } /* * for now each logical cpu is in its own vector allocation domain. */ -static cpumask_t x2apic_vector_allocation_domain(int cpu) +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, @@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, * at once. We have 16 cpu's in a cluster. This will minimize IPI register * writes. */ -static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned long flags; unsigned long query_cpu; local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { - __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, APIC_DEST_LOGICAL); - } + for_each_cpu(query_cpu, mask) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } -static void x2apic_send_IPI_allbutself(int vector) +static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { - cpumask_t mask = cpu_online_map; + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - cpu_clear(smp_processor_id(), mask); + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - x2apic_send_IPI_mask(mask, vector); + local_irq_save(flags); + for_each_online_cpu(query_cpu) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + local_irq_restore(flags); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_map, vector); + x2apic_send_IPI_mask(cpu_online_mask, vector); } static int x2apic_apic_id_registered(void) @@ -89,21 +109,38 @@ static int x2apic_apic_id_registered(void) return 1; } -static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. + * We're using fixed IRQ delivery, can only return one logical APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + cpu = cpumask_first(cpumask); + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_logical_apicid, cpu); else return BAD_APICID; } +static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one logical APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_logical_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -150,8 +187,10 @@ struct genapic apic_x2apic_cluster = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_allbutself = x2apic_send_IPI_allbutself, .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index d042211768b..21bcc0e098b 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t x2apic_target_cpus(void) +static const struct cpumask *x2apic_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t x2apic_vector_allocation_domain(int cpu) +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, @@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, x2apic_icr_write(cfg, apicid); } -static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned long flags; unsigned long query_cpu; local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { + for_each_cpu(query_cpu, mask) { __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } -static void x2apic_send_IPI_allbutself(int vector) +static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) { - cpumask_t mask = cpu_online_map; + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} - cpu_clear(smp_processor_id(), mask); +static void x2apic_send_IPI_allbutself(int vector) +{ + unsigned long flags; + unsigned long query_cpu; + unsigned long this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - x2apic_send_IPI_mask(mask, vector); + local_irq_save(flags); + for_each_online_cpu(query_cpu) + if (query_cpu != this_cpu) + __x2apic_send_IPI_dest( + per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + local_irq_restore(flags); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_map, vector); + x2apic_send_IPI_mask(cpu_online_mask, vector); } static int x2apic_apic_id_registered(void) @@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void) return 1; } -static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -95,13 +116,30 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + cpu = cpumask_first(cpumask); + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -123,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb) return current_cpu_data.initial_apicid >> index_msb; } -void x2apic_send_IPI_self(int vector) +static void x2apic_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -void init_x2apic_ldr(void) +static void init_x2apic_ldr(void) { return; } @@ -145,8 +183,10 @@ struct genapic apic_x2apic_phys = { .send_IPI_all = x2apic_send_IPI_all, .send_IPI_allbutself = x2apic_send_IPI_allbutself, .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 2c7dbdb9827..b193e082f6c 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/threads.h> +#include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/string.h> #include <linux/ctype.h> @@ -17,6 +18,9 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/hardirq.h> +#include <linux/timer.h> +#include <linux/proc_fs.h> +#include <asm/current.h> #include <asm/smp.h> #include <asm/ipi.h> #include <asm/genapic.h> @@ -75,16 +79,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ -static cpumask_t uv_target_cpus(void) +static const struct cpumask *uv_target_cpus(void) { - return cpumask_of_cpu(0); + return cpumask_of(0); } -static cpumask_t uv_vector_allocation_domain(int cpu) +static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) @@ -123,28 +126,37 @@ static void uv_send_IPI_one(int cpu, int vector) uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } -static void uv_send_IPI_mask(cpumask_t mask, int vector) +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) { unsigned int cpu; - for_each_possible_cpu(cpu) - if (cpu_isset(cpu, mask)) + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); + + for_each_cpu(cpu, mask) + if (cpu != this_cpu) uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); + unsigned int cpu; + unsigned int this_cpu = smp_processor_id(); - if (!cpus_empty(mask)) - uv_send_IPI_mask(mask, vector); + for_each_online_cpu(cpu) + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); } static void uv_send_IPI_all(int vector) { - uv_send_IPI_mask(cpu_online_map, vector); + uv_send_IPI_mask(cpu_online_mask, vector); } static int uv_apic_id_registered(void) @@ -156,7 +168,7 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) +static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) { int cpu; @@ -164,13 +176,30 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * We're using fixed IRQ delivery, can only return one phys APIC ID. * May as well be the first. */ - cpu = first_cpu(cpumask); + cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } +static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + if (cpu < nr_cpu_ids) + return per_cpu(x86_cpu_to_apicid, cpu); + return BAD_APICID; +} + static unsigned int get_apic_id(unsigned long x) { unsigned int id; @@ -218,8 +247,10 @@ struct genapic apic_x2apic_uv_x = { .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, @@ -356,6 +387,103 @@ static __init void uv_rtc_init(void) } /* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void __cpuinit uv_heartbeat_enable(int cpu) +{ + if (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + } + + /* check boot cpu */ + if (!uv_cpu_hub_info(0)->scir.enabled) + uv_heartbeat_enable(0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __cpuinit uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +/* + * cpu hotplug notifier + */ +static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) +{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* * Called on each cpu to initialize the per_cpu UV data area. * ZZZ hotplug not supported yet */ @@ -428,7 +556,7 @@ void __init uv_system_init(void) uv_bios_init(); uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &uv_coherency_id, &uv_region_size); + &sn_coherency_id, &sn_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -439,8 +567,7 @@ void __init uv_system_init(void) uv_blade_info[blade].nr_possible_cpus++; uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; - uv_cpu_hub_info(cpu)->lowmem_remap_top = - lowmem_redir_base + lowmem_redir_size; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; uv_cpu_hub_info(cpu)->n_val = m_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; @@ -450,7 +577,8 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); @@ -467,4 +595,6 @@ void __init uv_system_init(void) map_mmioh_high(max_pnode); uv_cpu_init(); + uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); } diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 1dcb0f13897..3e66bd364a9 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,7 +35,6 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); - printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index fa1d25dd83e..ac108d1fe18 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -12,9 +12,12 @@ #include <asm/sections.h> #include <asm/e820.h> #include <asm/bios_ebda.h> +#include <asm/trampoline.h> void __init i386_start_kernel(void) { + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index d16084f9064..b9a4d8c4b93 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,9 +24,10 @@ #include <asm/kdebug.h> #include <asm/e820.h> #include <asm/bios_ebda.h> +#include <asm/trampoline.h> /* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda __read_mostly; +static struct x8664_pda _boot_cpu_pda; #ifdef CONFIG_SMP /* @@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); + reserve_trampoline_memory(); + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 067d8de913f..cd759ad9069 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,7 +33,9 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; -unsigned long hpet_num_timers; +#ifdef CONFIG_PCI_MSI +static unsigned long hpet_num_timers; +#endif static void __iomem *hpet_virt_address; struct hpet_dev { @@ -246,7 +248,7 @@ static void hpet_legacy_clockevent_register(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ - hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); @@ -301,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode, struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); hpet_setup_msi_irq(hdev->irq); disable_irq(hdev->irq); - irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); enable_irq(hdev->irq); } break; @@ -449,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev) return -1; disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", @@ -500,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) /* 5 usec minimum reprogramming delta. */ evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of_cpu(hdev->cpu); + evt->cpumask = cpumask_of(hdev->cpu); clockevents_register_device(evt); } @@ -811,7 +813,7 @@ int __init hpet_enable(void) out_nohpet: hpet_clear_mapping(); - boot_hpet_disable = 1; + hpet_address = 0; return 0; } @@ -834,10 +836,11 @@ static __init int hpet_late_init(void) hpet_address = force_hpet_address; hpet_enable(); - if (!hpet_virt_address) - return -ENODEV; } + if (!hpet_virt_address) + return -ENODEV; + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); for_each_online_cpu(cpu) { diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index c1b5e3ece1f..10f92fb532f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -114,7 +114,7 @@ void __init setup_pit_timer(void) * Start pit with the boot cpu mask and make it global after the * IO_APIC has been initialized. */ - pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + pit_clockevent.cpumask = cpumask_of(smp_processor_id()); pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_clockevent.shift); pit_clockevent.max_delta_ns = diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c..df3bf269bea 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -10,11 +10,9 @@ #include <asm/pgtable.h> #include <asm/desc.h> -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ /* * Initial thread structure. diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210f..3639442aa7a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -108,93 +108,276 @@ static int __init parse_noapic(char *str) early_param("noapic", parse_noapic); struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ + struct irq_pin_list *pin; + int node; + + node = cpu_to_node(cpu); + + pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node); + + return pin; +} + struct irq_cfg { - unsigned int irq; struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; + cpumask_var_t domain; + cpumask_var_t old_domain; unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + u8 move_desc_pending : 1; +#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else static struct irq_cfg irq_cfgx[NR_IRQS] = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +#endif + [0] = { .vector = IRQ0_VECTOR, }, + [1] = { .vector = IRQ1_VECTOR, }, + [2] = { .vector = IRQ2_VECTOR, }, + [3] = { .vector = IRQ3_VECTOR, }, + [4] = { .vector = IRQ4_VECTOR, }, + [5] = { .vector = IRQ5_VECTOR, }, + [6] = { .vector = IRQ6_VECTOR, }, + [7] = { .vector = IRQ7_VECTOR, }, + [8] = { .vector = IRQ8_VECTOR, }, + [9] = { .vector = IRQ9_VECTOR, }, + [10] = { .vector = IRQ10_VECTOR, }, + [11] = { .vector = IRQ11_VECTOR, }, + [12] = { .vector = IRQ12_VECTOR, }, + [13] = { .vector = IRQ13_VECTOR, }, + [14] = { .vector = IRQ14_VECTOR, }, + [15] = { .vector = IRQ15_VECTOR, }, }; -#define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + int count; + int i; + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + for (i = 0; i < count; i++) { + desc = irq_to_desc(i); + desc->chip_data = &cfg[i]; + alloc_bootmem_cpumask_var(&cfg[i].domain); + alloc_bootmem_cpumask_var(&cfg[i].old_domain); + if (i < NR_IRQS_LEGACY) + cpumask_setall(cfg[i].domain); + } + + return 0; +} + +#ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; + struct irq_cfg *cfg = NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + if (desc) + cfg = desc->chip_data; + + return cfg; } -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +static struct irq_cfg *get_one_free_irq_cfg(int cpu) { - return irq_cfg(irq); + struct irq_cfg *cfg; + int node; + + node = cpu_to_node(cpu); + + cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + if (cfg) { + if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + kfree(cfg); + cfg = NULL; + } else if (!alloc_cpumask_var_node(&cfg->old_domain, + GFP_ATOMIC, node)) { + free_cpumask_var(cfg->domain); + kfree(cfg); + cfg = NULL; + } else { + cpumask_clear(cfg->domain); + cpumask_clear(cfg->old_domain); + } + } + printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); + + return cfg; } -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int arch_init_chip_data(struct irq_desc *desc, int cpu) +{ + struct irq_cfg *cfg; -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ + cfg = desc->chip_data; + if (!cfg) { + desc->chip_data = get_one_free_irq_cfg(cpu); + if (!desc->chip_data) { + printk(KERN_ERR "can not alloc irq_cfg\n"); + BUG_ON(1); + } + } -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; + return 0; +} -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC -static void __init irq_2_pin_init(void) +static void +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) { - struct irq_pin_list *pin = irq_2_pin_head; - int i; + struct irq_pin_list *old_entry, *head, *tail, *entry; + + cfg->irq_2_pin = NULL; + old_entry = old_cfg->irq_2_pin; + if (!old_entry) + return; + + entry = get_one_free_irq_2_pin(cpu); + if (!entry) + return; + + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + head = entry; + tail = entry; + old_entry = old_entry->next; + while (old_entry) { + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + entry = head; + while (entry) { + head = entry->next; + kfree(entry); + entry = head; + } + /* still use the old one */ + return; + } + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + tail->next = entry; + tail = entry; + old_entry = old_entry->next; + } - for (i = 1; i < PIN_MAP_SIZE; i++) - pin[i-1].next = &pin[i]; + tail->next = NULL; + cfg->irq_2_pin = head; +} + +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry, *next; + + if (old_cfg->irq_2_pin == cfg->irq_2_pin) + return; - irq_2_pin_ptr = &pin[0]; + entry = old_cfg->irq_2_pin; + + while (entry) { + next = entry->next; + kfree(entry); + entry = next; + } + old_cfg->irq_2_pin = NULL; } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) { - struct irq_pin_list *pin = irq_2_pin_ptr; + struct irq_cfg *cfg; + struct irq_cfg *old_cfg; - if (!pin) - panic("can not get more irq_2_pin\n"); + cfg = get_one_free_irq_cfg(cpu); - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; + if (!cfg) + return; + + desc->chip_data = cfg; + + old_cfg = old_desc->chip_data; + + memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + + init_copy_irq_2_pin(old_cfg, cfg, cpu); +} + +static void free_irq_cfg(struct irq_cfg *old_cfg) +{ + kfree(old_cfg); +} + +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +{ + struct irq_cfg *old_cfg, *cfg; + + old_cfg = old_desc->chip_data; + cfg = desc->chip_data; + + if (old_cfg == cfg) + return; + + if (old_cfg) { + free_irq_2_pin(old_cfg, cfg); + free_irq_cfg(old_cfg); + old_desc->chip_data = NULL; + } +} + +static void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->move_in_progress) { + /* it means that domain is not changed */ + if (!cpumask_intersects(&desc->affinity, mask)) + cfg->move_desc_pending = 1; + } } +#endif + +#else +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; +} + +#endif + +#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC +static inline void +set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) +{ +} +#endif struct io_apic { unsigned int index; @@ -237,11 +420,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) { struct irq_pin_list *entry; unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); entry = cfg->irq_2_pin; @@ -323,13 +505,32 @@ static void ioapic_mask_entry(int apic, int pin) } #ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +static void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + cfg->move_cleanup_count = 0; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + cfg->move_cleanup_count++; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + cfg->move_cleanup_count = cpumask_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; + u8 vector = cfg->vector; - cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { unsigned int reg; @@ -359,36 +560,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) } } -static int assign_irq_vector(int irq, cpumask_t mask); +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +/* + * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid + * of that, or returns BAD_APICID and leaves desc->affinity untouched. + */ +static unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +{ + struct irq_cfg *cfg; + unsigned int irq; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return BAD_APICID; + + irq = desc->irq; + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) + return BAD_APICID; + + cpumask_and(&desc->affinity, cfg->domain, mask); + set_extra_move_desc(desc, mask); + return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); +} + +static void +set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; unsigned long flags; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; + irq = desc->irq; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; + spin_lock_irqsave(&ioapic_lock, flags); + dest = set_desc_affinity(desc, mask); + if (dest != BAD_APICID) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, cfg); + } + spin_unlock_irqrestore(&ioapic_lock, flags); +} - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); +static void +set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc; desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); + + set_ioapic_affinity_irq_desc(desc, mask); } #endif /* CONFIG_SMP */ @@ -397,16 +623,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin) { - struct irq_cfg *cfg; struct irq_pin_list *entry; - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(); + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", + apic, pin); + return; + } cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; @@ -421,7 +649,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(); + entry->next = get_one_free_irq_2_pin(cpu); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -430,11 +658,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_cfg *cfg = irq_cfg(irq); struct irq_pin_list *entry = cfg->irq_2_pin; int replaced = 0; @@ -451,18 +678,16 @@ static void __init replace_pin_at_irq(unsigned int irq, /* why? call replace before add? */ if (!replaced) - add_pin_to_irq(irq, newapic, newpin); + add_pin_to_irq_cpu(cfg, cpu, newapic, newpin); } -static inline void io_apic_modify_irq(unsigned int irq, +static inline void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { int pin; - struct irq_cfg *cfg; struct irq_pin_list *entry; - cfg = irq_cfg(irq); for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; pin = entry->pin; @@ -475,13 +700,13 @@ static inline void io_apic_modify_irq(unsigned int irq, } } -static void __unmask_IO_APIC_irq(unsigned int irq) +static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } #ifdef CONFIG_X86_64 -void io_apic_sync(struct irq_pin_list *entry) +static void io_apic_sync(struct irq_pin_list *entry) { /* * Synchronize the IO-APIC and the CPU by doing @@ -492,47 +717,64 @@ void io_apic_sync(struct irq_pin_list *entry) readl(&io_apic->data); } -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } #else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); } -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, IO_APIC_REDIR_MASKED, NULL); } -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) { - io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } #endif /* CONFIG_X86_32 */ -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; + BUG_ON(!cfg); + spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); + __mask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) { + struct irq_cfg *cfg = desc->chip_data; unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } +static void mask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq_desc(desc); +} +static void unmask_IO_APIC_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unmask_IO_APIC_irq_desc(desc); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -809,7 +1051,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); */ static int EISA_ELCR(unsigned int irq) { - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1034,7 +1276,8 @@ void unlock_vector_lock(void) spin_unlock(&vector_lock); } -static int __assign_irq_vector(int irq, cpumask_t mask) +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { /* * NOTE! The local APIC isn't very good at handling @@ -1049,52 +1292,49 @@ static int __assign_irq_vector(int irq, cpumask_t mask) */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + int cpu, err; + cpumask_var_t tmp_mask; if ((cfg->move_in_progress) || cfg->move_cleanup_count) return -EBUSY; + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + old_vector = cfg->vector; if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); return 0; + } } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { int new_cpu; int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + vector_allocation_domain(cpu, tmp_mask); vector = current_vector; offset = current_offset; next: vector += 8; if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ + /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; vector = FIRST_DEVICE_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) + + if (test_bit(vector, used_vectors)) goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -1102,49 +1342,47 @@ next: current_offset = offset; if (old_vector) { cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; + cpumask_copy(cfg->old_domain, cfg->domain); } - for_each_cpu_mask_nr(new_cpu, new_mask) + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; - cfg->domain = domain; - return 0; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; } - return -ENOSPC; + free_cpumask_var(tmp_mask); + return err; } -static int assign_irq_vector(int irq, cpumask_t mask) +static int +assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); + err = __assign_irq_vector(irq, cfg, mask); spin_unlock_irqrestore(&vector_lock, flags); return err; } -static void __clear_irq_vector(int irq) +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) { - struct irq_cfg *cfg; - cpumask_t mask; int cpu, vector; - cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; - cpus_clear(cfg->domain); + cpumask_clear(cfg->domain); if (likely(!cfg->move_in_progress)) return; - cpus_and(mask, cfg->old_domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1162,10 +1400,12 @@ void __setup_vector_irq(int cpu) /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; + struct irq_desc *desc; /* Mark the inuse vectors */ - for_each_irq_cfg(irq, cfg) { - if (!cpu_isset(cpu, cfg->domain)) + for_each_irq_desc(irq, desc) { + cfg = desc->chip_data; + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; @@ -1177,7 +1417,7 @@ void __setup_vector_irq(int cpu) continue; cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) + if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1215,11 +1455,8 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) @@ -1311,23 +1548,22 @@ static int setup_ioapic_entry(int apic, int irq, return 0; } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc, int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - cpumask_t mask; + unsigned int dest; if (!IO_APIC_IRQ(irq)) return; - cfg = irq_cfg(irq); + cfg = desc->chip_data; - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) + if (assign_irq_vector(irq, cfg, TARGET_CPUS)) return; - cpus_and(mask, cfg->domain, mask); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1337,16 +1573,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { + dest, trigger, polarity, cfg->vector)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); return; } - ioapic_register_intr(irq, trigger); - if (irq < 16) + ioapic_register_intr(irq, desc, trigger); + if (irq < NR_IRQS_LEGACY) disable_8259A_irq(irq); ioapic_write_entry(apic, pin, entry); @@ -1356,6 +1591,9 @@ static void __init setup_IO_APIC_irqs(void) { int apic, pin, idx, irq; int notcon = 0; + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1387,9 +1625,15 @@ static void __init setup_IO_APIC_irqs(void) if (multi_timer_check(apic, irq)) continue; #endif - add_pin_to_irq(irq, apic, pin); + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, + setup_IO_APIC_irq(apic, pin, irq, desc, irq_trigger(idx), irq_polarity(idx)); } } @@ -1448,6 +1692,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + struct irq_desc *desc; unsigned int irq; if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1782,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(irq, cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; + for_each_irq_desc(irq, desc) { + struct irq_pin_list *entry; + + cfg = desc->chip_data; + entry = cfg->irq_2_pin; if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); @@ -2022,14 +2270,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; + struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { + if (irq < NR_IRQS_LEGACY) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; } - __unmask_IO_APIC_irq(irq); + cfg = irq_cfg(irq); + __unmask_IO_APIC_irq(cfg); spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; @@ -2043,7 +2293,7 @@ static int ioapic_retrigger_irq(unsigned int irq) unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; @@ -2092,35 +2342,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); * as simple as edge triggered migration and we can do the irq migration * with a simple atomic update to IO-APIC RTE. */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) +static void +migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; struct irte irte; int modify_ioapic_rte; unsigned int dest; unsigned long flags; + unsigned int irq; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpumask_intersects(mask, cpu_online_mask)) return; + irq = desc->irq; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + cfg = desc->chip_data; + if (assign_irq_vector(irq, cfg, mask)) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + set_extra_move_desc(desc, mask); + + dest = cpu_mask_to_apicid_and(cfg->domain, mask); - desc = irq_to_desc(irq); modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); + __target_IO_APIC_irq(irq, dest, cfg); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2132,24 +2382,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) */ modify_irte(irq, &irte); - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (cfg->move_in_progress) + send_cleanup_vector(cfg); - desc->affinity = mask; + cpumask_copy(&desc->affinity, mask); } -static int migrate_irq_remapped_level(int irq) +static int migrate_irq_remapped_level_desc(struct irq_desc *desc) { int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); - if (io_apic_level_ack_pending(irq)) { + if (io_apic_level_ack_pending(cfg)) { /* * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse @@ -2161,14 +2407,15 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); + migrate_ioapic_irq_desc(desc, &desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); + cpumask_clear(&desc->pending_mask); unmask: - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); + return ret; } @@ -2189,7 +2436,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, desc->pending_mask); + desc->chip->set_affinity(irq, &desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -2198,28 +2445,33 @@ static void ir_irq_migration(struct work_struct *work) /* * Migrates the IRQ destination in the process context. */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, + const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); + cpumask_copy(&desc->pending_mask, mask); + migrate_irq_remapped_level_desc(desc); return; } - migrate_ioapic_irq(irq, mask); + migrate_ioapic_irq_desc(desc, mask); +} +static void set_ir_ioapic_affinity_irq(unsigned int irq, + const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + set_ir_ioapic_affinity_irq_desc(desc, mask); } #endif asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; + ack_APIC_irq(); -#ifdef CONFIG_X86_64 exit_idle(); -#endif irq_enter(); me = smp_processor_id(); @@ -2229,6 +2481,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; + if (irq == -1) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -2238,7 +2493,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (!cfg->move_cleanup_count) goto unlock; - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; __get_cpu_var(vector_irq)[vector] = -1; @@ -2250,28 +2505,44 @@ unlock: irq_exit(); } -static void irq_complete_move(unsigned int irq) +static void irq_complete_move(struct irq_desc **descp) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_desc *desc = *descp; + struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) + if (likely(!cfg->move_in_progress)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + if (likely(!cfg->move_desc_pending)) + return; + + /* domain has not changed, but affinity did */ + me = smp_processor_id(); + if (cpu_isset(me, desc->affinity)) { + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; + cfg->move_desc_pending = 0; + } +#endif return; + } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; +#endif - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); } #else -static inline void irq_complete_move(unsigned int irq) {} +static inline void irq_complete_move(struct irq_desc **descp) {} #endif + #ifdef CONFIG_INTR_REMAP static void ack_x2apic_level(unsigned int irq) { @@ -2282,11 +2553,14 @@ static void ack_x2apic_edge(unsigned int irq) { ack_x2APIC_irq(); } + #endif static void ack_apic_edge(unsigned int irq) { - irq_complete_move(irq); + struct irq_desc *desc = irq_to_desc(irq); + + irq_complete_move(&desc); move_native_irq(irq); ack_APIC_irq(); } @@ -2295,18 +2569,21 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { + struct irq_desc *desc = irq_to_desc(irq); + #ifdef CONFIG_X86_32 unsigned long v; int i; #endif + struct irq_cfg *cfg; int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(&desc); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; - mask_IO_APIC_irq(irq); + mask_IO_APIC_irq_desc(desc); } #endif @@ -2330,7 +2607,8 @@ static void ack_apic_level(unsigned int irq) * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_cfg(irq)->vector; + cfg = desc->chip_data; + i = cfg->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); #endif @@ -2369,17 +2647,18 @@ static void ack_apic_level(unsigned int irq) * accurate and is causing problems then it is a hardware bug * and you can go talk to the chipset vendor about it. */ - if (!io_apic_level_ack_pending(irq)) + cfg = desc->chip_data; + if (!io_apic_level_ack_pending(cfg)) move_masked_irq(irq); - unmask_IO_APIC_irq(irq); + unmask_IO_APIC_irq_desc(desc); } #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } #endif @@ -2430,20 +2709,19 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(irq, cfg) { - if (IO_APIC_IRQ(irq) && !cfg->vector) { + for_each_irq_desc(irq, desc) { + cfg = desc->chip_data; + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < 16) + if (irq < NR_IRQS_LEGACY) make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); + else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; - } } } } @@ -2468,7 +2746,7 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void ack_lapic_irq(unsigned int irq) { ack_APIC_irq(); } @@ -2480,11 +2758,8 @@ static struct irq_chip lapic_chip __read_mostly = { .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq) +static void lapic_register_intr(int irq, struct irq_desc *desc) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); @@ -2588,7 +2863,9 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg(0); + struct irq_desc *desc = irq_to_desc(0); + struct irq_cfg *cfg = desc->chip_data; + int cpu = boot_cpu_id; int apic1, pin1, apic2, pin2; unsigned long flags; unsigned int ver; @@ -2603,7 +2880,7 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); + assign_irq_vector(0, cfg, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2654,10 +2931,10 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? */ if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); + add_pin_to_irq_cpu(cfg, cpu, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); @@ -2683,9 +2960,9 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); + unmask_IO_APIC_irq_desc(desc); enable_8259A_irq(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); @@ -2717,7 +2994,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0); + lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2902,22 +3179,26 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int irq; unsigned int new; unsigned long flags; - struct irq_cfg *cfg_new; - - irq_want = nr_irqs - 1; + struct irq_cfg *cfg_new = NULL; + int cpu = boot_cpu_id; + struct irq_desc *desc_new = NULL; irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { + for (new = irq_want; new < NR_IRQS; new++) { if (platform_legacy_irq(new)) continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) + + desc_new = irq_to_desc_alloc_cpu(new, cpu); + if (!desc_new) { + printk(KERN_INFO "can not get irq_desc for %d\n", new); continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) + } + cfg_new = desc_new->chip_data; + + if (cfg_new->vector != 0) + continue; + if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0) irq = new; break; } @@ -2925,15 +3206,21 @@ unsigned int create_irq_nr(unsigned int irq_want) if (irq > 0) { dynamic_irq_init(irq); + /* restore it, in case dynamic_irq_init clear it */ + if (desc_new) + desc_new->chip_data = cfg_new; } return irq; } +static int nr_irqs_gsi = NR_IRQS_LEGACY; int create_irq(void) { + unsigned int irq_want; int irq; - irq = create_irq_nr(nr_irqs - 1); + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want); if (irq == 0) irq = -1; @@ -2944,14 +3231,22 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc; + /* store it, in case dynamic_irq_cleanup clear it */ + desc = irq_to_desc(irq); + cfg = desc->chip_data; dynamic_irq_cleanup(irq); + /* connect back irq_cfg */ + if (desc) + desc->chip_data = cfg; #ifdef CONFIG_INTR_REMAP free_irte(irq); #endif spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); + __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } @@ -2964,16 +3259,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms struct irq_cfg *cfg; int err; unsigned dest; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (err) return err; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { @@ -3027,64 +3319,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms } #ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; + cfg = desc->chip_data; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); + read_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + write_msi_msg_desc(desc, &msg); } - #ifdef CONFIG_INTR_REMAP /* * Migrate the MSI irq to another cpumask. This migration is * done in the process context using interrupt-remapping hardware. */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +static void +ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) { - struct irq_cfg *cfg; + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; unsigned int dest; - cpumask_t tmp, cleanup_mask; struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; if (get_irte(irq, &irte)) return; - if (assign_irq_vector(irq, mask)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); @@ -3098,16 +3374,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) * at the new destination. So, time to cleanup the previous * vector allocation. */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; + if (cfg->move_in_progress) + send_cleanup_vector(cfg); } + #endif #endif /* CONFIG_SMP */ @@ -3166,7 +3436,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) } #endif -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { int ret; struct msi_msg msg; @@ -3175,7 +3445,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, desc); + set_irq_msi(irq, msidesc); write_msi_msg(irq, &msg); #ifdef CONFIG_INTR_REMAP @@ -3195,26 +3465,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) { unsigned int irq; int ret; unsigned int irq_want; - irq_want = build_irq_for_pci_dev(dev) + 0x100; - + irq_want = nr_irqs_gsi; irq = create_irq_nr(irq_want); if (irq == 0) return -1; @@ -3228,7 +3485,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) goto error; no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) { destroy_irq(irq); return ret; @@ -3246,7 +3503,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { unsigned int irq; int ret, sub_handle; - struct msi_desc *desc; + struct msi_desc *msidesc; unsigned int irq_want; #ifdef CONFIG_INTR_REMAP @@ -3254,10 +3511,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif - irq_want = build_irq_for_pci_dev(dev) + 0x100; + irq_want = nr_irqs_gsi; sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want); + irq_want++; if (irq == 0) return -1; #ifdef CONFIG_INTR_REMAP @@ -3289,7 +3547,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) } no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; sub_handle++; @@ -3308,24 +3566,18 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_DMAR #ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; dmar_msi_read(irq, &msg); @@ -3335,9 +3587,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip dmar_msi_type = { @@ -3369,24 +3620,18 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER #ifdef CONFIG_SMP -static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; - struct irq_desc *desc; struct msi_msg msg; unsigned int dest; - cpumask_t tmp; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; hpet_msi_read(irq, &msg); @@ -3396,9 +3641,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); hpet_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif /* CONFIG_SMP */ struct irq_chip hpet_msi_type = { @@ -3451,28 +3695,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) write_ht_irq_msg(irq, &msg); } -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) { + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) return; - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); + cfg = desc->chip_data; target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; } + #endif static struct irq_chip ht_irq_chip = { @@ -3490,17 +3727,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; int err; - cpumask_t tmp; - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, TARGET_CPUS); if (!err) { struct ht_irq_msg msg; unsigned dest; - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -3536,7 +3770,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_offset) { - const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3544,7 +3778,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long flags; int err; - err = assign_irq_vector(irq, *eligible_cpu); + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); if (err != 0) return err; @@ -3553,8 +3789,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, irq_name); spin_unlock_irqrestore(&vector_lock, flags); - cfg = irq_cfg(irq); - mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3565,7 +3799,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = cpu_mask_to_apicid(*eligible_cpu); + entry->dest = cpu_mask_to_apicid(eligible_cpu); mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -3606,9 +3840,16 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries; } -int __init probe_nr_irqs(void) +void __init probe_nr_irqs_gsi(void) { - return NR_IRQS; + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx) + 1; + + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; } /* -------------------------------------------------------------------------- @@ -3707,19 +3948,31 @@ int __init io_apic_get_version(int ioapic) int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; + if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + if (irq >= NR_IRQS_LEGACY) { + cfg = desc->chip_data; + add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); + } - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity); return 0; } @@ -3757,7 +4010,7 @@ void __init setup_ioapic_dest(void) int pin, ioapic, irq, irq_entry; struct irq_desc *desc; struct irq_cfg *cfg; - cpumask_t mask; + const struct cpumask *mask; if (skip_ioapic_setup == 1) return; @@ -3773,9 +4026,10 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - cfg = irq_cfg(irq); + desc = irq_to_desc(irq); + cfg = desc->chip_data; if (!cfg->vector) { - setup_IO_APIC_irq(ioapic, pin, irq, + setup_IO_APIC_irq(ioapic, pin, irq, desc, irq_trigger(irq_entry), irq_polarity(irq_entry)); continue; @@ -3785,19 +4039,18 @@ void __init setup_ioapic_dest(void) /* * Honour affinities which have been set in early boot */ - desc = irq_to_desc(irq); if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->affinity; + mask = &desc->affinity; else mask = TARGET_CPUS; #ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, mask); + set_ir_ioapic_affinity_irq_desc(desc, mask); else #endif - set_ioapic_affinity_irq(irq, mask); + set_ioapic_affinity_irq_desc(desc, mask); } } @@ -3846,7 +4099,6 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index f1c688e46f3..285bbf8831f 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * This is only used on smaller machines. */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; local_irq_save(flags); - WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); __send_IPI_dest_field(mask, vector); local_irq_restore(flags); } -void send_IPI_mask_sequence(cpumask_t mask, int vector) +void send_IPI_mask_sequence(const struct cpumask *mask, int vector) { unsigned long flags; unsigned int query_cpu; @@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for_each_possible_cpu(query_cpu) { - if (cpu_isset(query_cpu, mask)) { + for_each_cpu(query_cpu, mask) + __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); + local_irq_restore(flags); +} + +void send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + if (query_cpu != this_cpu) __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); - } - } local_irq_restore(flags); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..bce53e1352a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -9,6 +9,7 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/smp.h> +#include <asm/irq.h> atomic_t irq_err_count; @@ -118,6 +119,9 @@ int show_interrupts(struct seq_file *p, void *v) } desc = irq_to_desc(i); + if (!desc) + return 0; + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); @@ -187,3 +191,5 @@ u64 arch_irq_stat(void) #endif return sum; } + +EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a51382672de..9dc5588f336 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_HOTPLUG_CPU #include <mach_apic.h> -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; + const struct cpumask *affinity; + if (!desc) + continue; if (irq == 2) continue; - cpus_and(mask, desc->affinity, map); - if (any_online_cpu(mask) == NR_CPUS) { + affinity = &desc->affinity; + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { printk("Breaking affinity for irq %i\n", irq); - mask = map; + affinity = cpu_all_mask; } if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a..6383d50f82e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -13,12 +13,12 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/delay.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/io_apic.h> #include <asm/idle.h> #include <asm/smp.h> -#ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: * @@ -28,26 +28,25 @@ */ static inline void stack_overflow_check(struct pt_regs *regs) { +#ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); - static unsigned long warned = -60*HZ; - - if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + 128 && - time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); - show_stack(NULL,NULL); - warned = jiffies; - } -} + + WARN_ONCE(regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128, + + "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); #endif +} /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; @@ -60,9 +59,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; -#ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); -#endif desc = irq_to_desc(irq); if (likely(desc)) @@ -83,40 +80,43 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) } #ifdef CONFIG_HOTPLUG_CPU -void fixup_irqs(cpumask_t map) +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) { unsigned int irq; static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { - cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + const struct cpumask *affinity; + if (!desc) + continue; if (irq == 2) continue; /* interrupt's are disabled at this point */ spin_lock(&desc->lock); + affinity = &desc->affinity; if (!irq_has_action(irq) || - cpus_equal(desc->affinity, map)) { + cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); continue; } - cpus_and(mask, desc->affinity, map); - if (cpus_empty(mask)) { + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - mask = map; + affinity = cpu_all_mask; } if (desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, mask); + desc->chip->set_affinity(irq, affinity); else if (!(warned++)) set_affinity = 0; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa9803e8..84723295f88 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -68,8 +68,7 @@ void __init init_ISA_irqs (void) /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -111,6 +110,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -129,7 +140,7 @@ void __init native_init_IRQ(void) for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i]); + set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } @@ -147,10 +158,12 @@ void __init native_init_IRQ(void) alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff023539128..31ebfe38e96 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -24,41 +24,6 @@ #include <asm/i8259.h> /* - * Common place to define all x86 IRQ vectors - * - * This builds up the IRQ handler stubs using some ugly macros in irq.h - * - * These macros create the low-level assembly IRQ routines that save - * register context and call do_IRQ(). do_IRQ() then does all the - * operations that are needed to keep the AT (or SMP IOAPIC) - * interrupt-controller happy. - */ - -#define IRQ_NAME2(nr) nr##_interrupt(void) -#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) - -/* - * SMP has a few special interrupts for IPI messages - */ - -#define BUILD_IRQ(nr) \ - asmlinkage void IRQ_NAME(nr); \ - asm("\n.text\n.p2align\n" \ - "IRQ" #nr "_interrupt:\n\t" \ - "push $~(" #nr ") ; " \ - "jmp common_interrupt\n" \ - ".previous"); - -#define BI(x,y) \ - BUILD_IRQ(x##y) - -#define BUILD_16_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) BI(x,f) - -/* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x30-0x3f) */ @@ -73,37 +38,6 @@ * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) -BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) -BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) - -#undef BUILD_16_IRQS -#undef BI - - -#define IRQ(x,y) \ - IRQ##x##y##_interrupt - -#define IRQLIST_16(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) - -/* for the irq vectors */ -static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { - IRQLIST_16(0x2), IRQLIST_16(0x3), - IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), - IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) -}; - -#undef IRQ -#undef IRQLIST_16 - - - /* * IRQ2 is cascade interrupt to second interrupt controller @@ -135,6 +69,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] != -1) + return 1; + } + + return 0; +} + void __init init_ISA_irqs(void) { int i; @@ -142,8 +88,7 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ + for (i = 0; i < NR_IRQS_LEGACY; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -188,6 +133,7 @@ static void __init smp_intr_init(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e169ae9b6a6..652fce6d2cc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void) */ static unsigned long kvm_get_tsc_khz(void) { - return preset_lpj; + struct pvclock_vcpu_time_info *src; + src = &per_cpu(hv_clock, 0); + return pvclock_tsc_khz(src); } static void kvm_get_preset_lpj(void) { - struct pvclock_vcpu_time_info *src; unsigned long khz; u64 lpj; - src = &per_cpu(hv_clock, 0); - khz = pvclock_tsc_khz(src); + khz = kvm_get_tsc_khz(); lpj = ((u64)khz * 1000); do_div(lpj, HZ); @@ -194,5 +194,7 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; } } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index eee32b43fee..71f1d99a635 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,8 +12,8 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/vmalloc.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) if (err < 0) return err; - for(i = 0; i < old->size; i++) + for (i = 0; i < old->size; i++) write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 7a385746509..37f420018a4 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,6 +13,7 @@ #include <linux/numa.h> #include <linux/ftrace.h> #include <linux/suspend.h> +#include <linux/gfp.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -25,15 +26,6 @@ #include <asm/system.h> #include <asm/cacheflush.h> -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) -static u32 kexec_pgd[1024] PAGE_ALIGNED; -#ifdef CONFIG_X86_PAE -static u32 kexec_pmd0[1024] PAGE_ALIGNED; -static u32 kexec_pmd1[1024] PAGE_ALIGNED; -#endif -static u32 kexec_pte0[1024] PAGE_ALIGNED; -static u32 kexec_pte1[1024] PAGE_ALIGNED; - static void set_idt(void *newidt, __u16 limit) { struct desc_ptr curidt; @@ -76,6 +68,76 @@ static void load_segments(void) #undef __STR } +static void machine_kexec_free_page_tables(struct kimage *image) +{ + free_page((unsigned long)image->arch.pgd); +#ifdef CONFIG_X86_PAE + free_page((unsigned long)image->arch.pmd0); + free_page((unsigned long)image->arch.pmd1); +#endif + free_page((unsigned long)image->arch.pte0); + free_page((unsigned long)image->arch.pte1); +} + +static int machine_kexec_alloc_page_tables(struct kimage *image) +{ + image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); +#ifdef CONFIG_X86_PAE + image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); +#endif + image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!image->arch.pgd || +#ifdef CONFIG_X86_PAE + !image->arch.pmd0 || !image->arch.pmd1 || +#endif + !image->arch.pte0 || !image->arch.pte1) { + machine_kexec_free_page_tables(image); + return -ENOMEM; + } + return 0; +} + +static void machine_kexec_page_table_set_one( + pgd_t *pgd, pmd_t *pmd, pte_t *pte, + unsigned long vaddr, unsigned long paddr) +{ + pud_t *pud; + + pgd += pgd_index(vaddr); +#ifdef CONFIG_X86_PAE + if (!(pgd_val(*pgd) & _PAGE_PRESENT)) + set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); +#endif + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + if (!(pmd_val(*pmd) & _PAGE_PRESENT)) + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +static void machine_kexec_prepare_page_tables(struct kimage *image) +{ + void *control_page; + pmd_t *pmd = 0; + + control_page = page_address(image->control_code_page); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd0; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte0, + (unsigned long)control_page, __pa(control_page)); +#ifdef CONFIG_X86_PAE + pmd = image->arch.pmd1; +#endif + machine_kexec_page_table_set_one( + image->arch.pgd, pmd, image->arch.pte1, + __pa(control_page), __pa(control_page)); +} + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -87,12 +149,20 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Make control page executable. + * - Make control page executable. + * - Allocate page tables + * - Setup page tables */ int machine_kexec_prepare(struct kimage *image) { + int error; + if (nx_enabled) set_pages_x(image->control_code_page, 1); + error = machine_kexec_alloc_page_tables(image); + if (error) + return error; + machine_kexec_prepare_page_tables(image); return 0; } @@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image) { if (nx_enabled) set_pages_nx(image->control_code_page, 1); + machine_kexec_free_page_tables(image); } /* @@ -150,18 +221,7 @@ void machine_kexec(struct kimage *image) relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_PGD] = __pa(kexec_pgd); - page_list[VA_PGD] = (unsigned long)kexec_pgd; -#ifdef CONFIG_X86_PAE - page_list[PA_PMD_0] = __pa(kexec_pmd0); - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PMD_1] = __pa(kexec_pmd1); - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; -#endif - page_list[PA_PTE_0] = __pa(kexec_pte0); - page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PTE_1] = __pa(kexec_pte1); - page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_PGD] = __pa(image->arch.pgd); if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3b599518c32..c12314c9e86 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = { .set_mode = mfgpt_set_mode, .set_next_event = mfgpt_next_event, .rating = 250, - .cpumask = CPU_MASK_ALL, + .cpumask = cpu_all_mask, .shift = 32 }; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 5f8e5d75a25..c25fdb38229 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -10,7 +10,7 @@ * This driver allows to upgrade microcode on AMD * family 0x10 and 0x11 processors. * - * Licensed unter the terms of the GNU General Public + * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ @@ -32,9 +32,9 @@ #include <linux/platform_device.h> #include <linux/pci.h> #include <linux/pci_ids.h> +#include <linux/uaccess.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/processor.h> #include <asm/microcode.h> @@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 struct equiv_cpu_entry { - unsigned int installed_cpu; - unsigned int fixed_errata_mask; - unsigned int fixed_errata_compare; - unsigned int equiv_cpu; -}; + u32 installed_cpu; + u32 fixed_errata_mask; + u32 fixed_errata_compare; + u16 equiv_cpu; + u16 res; +} __attribute__((packed)); struct microcode_header_amd { - unsigned int data_code; - unsigned int patch_id; - unsigned char mc_patch_data_id[2]; - unsigned char mc_patch_data_len; - unsigned char init_flag; - unsigned int mc_patch_data_checksum; - unsigned int nb_dev_id; - unsigned int sb_dev_id; - unsigned char processor_rev_id[2]; - unsigned char nb_rev_id; - unsigned char sb_rev_id; - unsigned char bios_api_rev; - unsigned char reserved1[3]; - unsigned int match_reg[8]; -}; + u32 data_code; + u32 patch_id; + u16 mc_patch_data_id; + u8 mc_patch_data_len; + u8 init_flag; + u32 mc_patch_data_checksum; + u32 nb_dev_id; + u32 sb_dev_id; + u16 processor_rev_id; + u8 nb_rev_id; + u8 sb_rev_id; + u8 bios_api_rev; + u8 reserved1[3]; + u32 match_reg[8]; +} __attribute__((packed)); struct microcode_amd { struct microcode_header_amd hdr; unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) -#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define DWSIZE (sizeof(u32)) -/* For now we support a fixed ucode total size only */ -#define get_totalsize(mc) \ - ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ - + MC_HEADER_SIZE) +#define UCODE_MAX_SIZE 2048 +#define UCODE_CONTAINER_SECTION_HDR 8 +#define UCODE_CONTAINER_HEADER_SIZE 12 /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); @@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 dummy; memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", - cpu); + printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); return -1; } - - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (csig->rev) - : "i" (0x0000008B) : "ecx"); - - printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", - csig->rev); - + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); + printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } static int get_matching_microcode(int cpu, void *mc, int rev) { struct microcode_header_amd *mc_header = mc; - struct pci_dev *nb_pci_dev, *sb_pci_dev; unsigned int current_cpu_id; - unsigned int equiv_cpu_id = 0x00; + u16 equiv_cpu_id = 0; unsigned int i = 0; BUG_ON(equiv_cpu_table == NULL); @@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev) } if (!equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d cpu_id " - "not found in equivalent cpu table \n", cpu); + printk(KERN_WARNING "microcode: CPU%d: cpu revision " + "not listed in equivalent cpu table\n", cpu); return 0; } - if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { - printk(KERN_ERR - "microcode: CPU%d patch does not match " - "(patch is %x, cpu extended is %x) \n", - cpu, mc_header->processor_rev_id[0], - (equiv_cpu_id & 0xff)); + if (mc_header->processor_rev_id != equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d: patch mismatch " + "(processor_rev_id: %x, equiv_cpu_id: %x)\n", + cpu, mc_header->processor_rev_id, equiv_cpu_id); return 0; } - if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { - printk(KERN_ERR "microcode: CPU%d patch does not match " - "(patch is %x, cpu base id is %x) \n", - cpu, mc_header->processor_rev_id[1], - ((equiv_cpu_id >> 16) & 0xff)); - + /* ucode might be chipset specific -- currently we don't support this */ + if (mc_header->nb_dev_id || mc_header->sb_dev_id) { + printk(KERN_ERR "microcode: CPU%d: loading of chipset " + "specific code not yet supported\n", cpu); return 0; } - /* ucode may be northbridge specific */ - if (mc_header->nb_dev_id) { - nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->nb_dev_id & 0xff), - NULL); - if ((!nb_pci_dev) || - (mc_header->nb_rev_id != nb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); - pci_dev_put(nb_pci_dev); - return 0; - } - pci_dev_put(nb_pci_dev); - } - - /* ucode may be southbridge specific */ - if (mc_header->sb_dev_id) { - sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, - (mc_header->sb_dev_id & 0xff), - NULL); - if ((!sb_pci_dev) || - (mc_header->sb_rev_id != sb_pci_dev->revision)) { - printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); - pci_dev_put(sb_pci_dev); - return 0; - } - pci_dev_put(sb_pci_dev); - } - if (mc_header->patch_id <= rev) return 0; @@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev) static void apply_microcode_amd(int cpu) { unsigned long flags; - unsigned int eax, edx; - unsigned int rev; + u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; - unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu) return; spin_lock_irqsave(µcode_update_lock, flags); - - addr = (unsigned long)&mc_amd->hdr.data_code; - edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); - eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); - - asm volatile("movl %0, %%ecx; wrmsr" : - : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); - + wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, - mc_amd->hdr.patch_id, rev); + printk(KERN_ERR "microcode: CPU%d: update failed " + "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x \n", - cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); + printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", + cpu, rev); uci->cpu_sig.rev = rev; } -static void * get_next_ucode(u8 *buf, unsigned int size, - int (*get_ucode_data)(void *, const void *, size_t), - unsigned int *mc_size) +static int get_ucode_data(void *to, const u8 *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static void *get_next_ucode(const u8 *buf, unsigned int size, + unsigned int *mc_size) { unsigned int total_size; -#define UCODE_CONTAINER_SECTION_HDR 8 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; @@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size, return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode payload type field\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_INFO "microcode: size %u, total_size %u\n", - size, total_size); + printk(KERN_DEBUG "microcode: size %u, total_size %u\n", + size, total_size); if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + printk(KERN_ERR "microcode: error: size mismatch\n"); return NULL; } mc = vmalloc(UCODE_MAX_SIZE); if (mc) { memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, + total_size)) { vfree(mc); mc = NULL; } else *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } -#undef UCODE_CONTAINER_SECTION_HDR return mc; } -static int install_equiv_cpu_table(u8 *buf, - int (*get_ucode_data)(void *, const void *, size_t)) +static int install_equiv_cpu_table(const u8 *buf) { -#define UCODE_CONTAINER_HEADER_SIZE 12 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; @@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf, size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table\n"); + printk(KERN_ERR "microcode: error: invalid type field in " + "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + printk(KERN_ERR "microcode: failed to allocate " + "equivalent CPU table\n"); return 0; } @@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf, } return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ -#undef UCODE_CONTAINER_HEADER_SIZE } static void free_equiv_cpu_table(void) @@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static int generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc; + const u8 *ucode_ptr = data; + void *new_mc = NULL; + void *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; - offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); + offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + printk(KERN_ERR "microcode: failed to create " + "equivalent cpu table\n"); return -EINVAL; } @@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + mc = get_next_ucode(ucode_ptr, leftover, &mc_size); if (!mc) break; @@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; - } else + } else vfree(mc); ucode_ptr += mc_size; @@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (uci->mc) vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("microcode: CPU%d found a matching microcode " + "update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size, return (int)leftover; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static int request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; @@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device) ret = request_firmware(&firmware, fw_name, device); if (ret) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return ret; } - ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, - &get_ucode_fw); + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); @@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device) static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" - "is not supported\n"); + printk(KERN_INFO "microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return -1; } @@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void) { return µcode_amd_ops; } + diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 82fb2809ce3..c9b721ba968 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -99,7 +99,7 @@ MODULE_LICENSE("GPL"); #define MICROCODE_VERSION "2.00" -struct microcode_ops *microcode_ops; +static struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); @@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, @@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = { .name = "microcode", }; -static void microcode_fini_cpu(int cpu) +static void __microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); microcode_ops->microcode_fini_cpu(cpu); uci->valid = 0; +} + +static void microcode_fini_cpu(int cpu) +{ + mutex_lock(µcode_mutex); + __microcode_fini_cpu(cpu); mutex_unlock(µcode_mutex); } @@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu) * to this cpu (a bit of paranoia): */ if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - microcode_fini_cpu(cpu); + __microcode_fini_cpu(cpu); + printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", + cpu); return -1; } - if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { - microcode_fini_cpu(cpu); + if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { + __microcode_fini_cpu(cpu); + printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", + cpu); /* Should we look for a new ucode here? */ return 1; } @@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu) return 0; } -void microcode_update_cpu(int cpu) +static void microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 622dc4a2178..b7f4c929e61 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock); static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(µcode_update_lock, flags); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); + spin_unlock_irqrestore(µcode_update_lock, flags); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", csig->sig, csig->pf, csig->rev); @@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu) uci->mc = NULL; } -struct microcode_ops microcode_intel_ops = { +static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index efc2f361fe8..666e43df51f 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -13,8 +13,7 @@ #include <asm/msr.h> #include <asm/acpi.h> #include <asm/mmconfig.h> - -#include "../pci/pci.h" +#include <asm/pci_x86.h> struct pci_hostbridge_probe { u32 bus; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index f98f4e1dba0..c5c5b8df1db 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -16,14 +16,14 @@ #include <linux/bitops.h> #include <linux/acpi.h> #include <linux/module.h> +#include <linux/smp.h> +#include <linux/acpi.h> -#include <asm/smp.h> #include <asm/mtrr.h> #include <asm/mpspec.h> #include <asm/pgalloc.h> #include <asm/io_apic.h> #include <asm/proto.h> -#include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/e820.h> #include <asm/trampoline.h> @@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m) #endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) + set_bit(m->mpc_busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { @@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) x86_quirks->mpc_oem_pci_bus(m); clear_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; @@ -586,23 +586,23 @@ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } + if (!mpf) + return; + if (acpi_lapic && early) return; + /* - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + if (acpi_lapic && acpi_ioapic) return; - } else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); + + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) + return; + } printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 82a7c7ed6d4..726266695b2 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -136,7 +136,7 @@ static int msr_open(struct inode *inode, struct file *file) lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= NR_CPUS || !cpu_online(cpu)) { + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { ret = -ENXIO; /* No such CPU */ goto out; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2c97f07f1c2..45a09ccdc21 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -26,11 +26,10 @@ #include <linux/kernel_stat.h> #include <linux/kdebug.h> #include <linux/smp.h> +#include <linux/nmi.h> #include <asm/i8259.h> #include <asm/io_apic.h> -#include <asm/smp.h> -#include <asm/nmi.h> #include <asm/proto.h> #include <asm/timer.h> @@ -131,6 +130,11 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) atomic_dec(&nmi_active); } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -179,8 +183,12 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; error: - if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) - disable_8259A_irq(0); + if (nmi_watchdog == NMI_IO_APIC) { + if (!timer_through_8259) + disable_8259A_irq(0); + on_each_cpu(__acpi_nmi_disable, NULL, 1); + } + #ifdef CONFIG_X86_32 timer_ack = 0; #endif @@ -199,12 +207,17 @@ static int __init setup_nmi_watchdog(char *str) ++str; } - get_option(&str, &nmi); - - if (nmi >= NMI_INVALID) - return 0; + if (!strncmp(str, "lapic", 5)) + nmi_watchdog = NMI_LOCAL_APIC; + else if (!strncmp(str, "ioapic", 6)) + nmi_watchdog = NMI_IO_APIC; + else { + get_option(&str, &nmi); + if (nmi >= NMI_INVALID) + return 0; + nmi_watchdog = nmi; + } - nmi_watchdog = nmi; return 1; } __setup("nmi_watchdog=", setup_nmi_watchdog); @@ -285,11 +298,6 @@ void acpi_nmi_enable(void) on_each_cpu(__acpi_nmi_enable, NULL, 1); } -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - /* * Disable timer based NMIs on all CPUs: */ @@ -340,6 +348,8 @@ void stop_apic_nmi_watchdog(void *unused) return; if (nmi_watchdog == NMI_LOCAL_APIC) lapic_watchdog_stop(); + else + __acpi_nmi_disable(NULL); __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -465,6 +475,24 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static void enable_ioapic_nmi_watchdog_single(void *unused) +{ + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + __acpi_nmi_enable(NULL); +} + +static void enable_ioapic_nmi_watchdog(void) +{ + on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); + touch_nmi_watchdog(); +} + +static void disable_ioapic_nmi_watchdog(void) +{ + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); +} + static int __init setup_unknown_nmi_panic(char *str) { unknown_nmi_panic = 1; @@ -507,6 +535,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + if (nmi_watchdog_enabled) + enable_ioapic_nmi_watchdog(); + else + disable_ioapic_nmi_watchdog(); } else { printk(KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 4caff39078e..0deea37a53c 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@ #include <asm/numaq.h> #include <asm/topology.h> #include <asm/processor.h> -#include <asm/mpspec.h> +#include <asm/genapic.h> #include <asm/e820.h> #include <asm/setup.h> @@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void) return 1; } +static int __init numaq_update_genapic(void) +{ + genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; + + return 0; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, .setup_ioapic_ids = numaq_setup_ioapic_ids, + .update_genapic = numaq_update_genapic, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 0e9f1982b1d..95777b0faa7 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -7,7 +7,8 @@ #include <asm/paravirt.h> -static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +static inline void +default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_lock(lock); } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 19262482021..19a1044a0cd 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -6,6 +6,7 @@ #include <asm/proto.h> #include <asm/dma.h> #include <asm/iommu.h> +#include <asm/gart.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> @@ -30,11 +31,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); - dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#endif void __init pci_iommu_alloc(void) { +#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#endif + /* * The order of these functions is important for * fall-back/fail-over reasons @@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) -{ - unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); - - return size >> PAGE_SHIFT; -} -EXPORT_SYMBOL(iommu_nr_pages); -#endif - void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -188,7 +179,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; iommu_merge = 1; force_iommu = 1; } @@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); + printk(KERN_INFO + "PCI: VIA PCI bridge detected. Disabling DAC.\n"); forbid_dac = 1; } } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a42b02b4df6..00c2bcd4146 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * to trigger bugs with some popular PCI cards, in particular 3ware (but * has been also also seen with Qlogic at least). */ -int iommu_fullflush = 1; +static int iommu_fullflush = 1; /* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); @@ -123,6 +123,8 @@ static void free_iommu(unsigned long offset, int size) spin_lock_irqsave(&iommu_bitmap_lock, flags); iommu_area_free(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } @@ -743,10 +745,8 @@ void __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { - printk(KERN_INFO "PCI-GART: No AMD GART found.\n"); + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) return; - } #ifndef CONFIG_AGP_AMD64 no_agp = 1; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 3c539d111ab..d59c9174766 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -3,6 +3,8 @@ #include <linux/pci.h> #include <linux/cache.h> #include <linux/module.h> +#include <linux/swiotlb.h> +#include <linux/bootmem.h> #include <linux/dma-mapping.h> #include <asm/iommu.h> @@ -11,6 +13,31 @@ int swiotlb __read_mostly; +void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) +{ + return alloc_bootmem_low_pages(size); +} + +void *swiotlb_alloc(unsigned order, unsigned long nslabs) +{ + return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); +} + +dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) +{ + return paddr; +} + +phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +{ + return baddr; +} + +int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size) +{ + return 0; +} + static dma_addr_t swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, int direction) @@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ +#ifdef CONFIG_X86_64 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; +#endif if (swiotlb_force) swiotlb = 1; if (swiotlb) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d..e68bb9e3086 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,13 +1,16 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <asm/idle.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/pm.h> #include <linux/clockchips.h> +#include <linux/ftrace.h> #include <asm/system.h> +#include <asm/apic.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -100,6 +103,9 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -112,6 +118,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(&it); } else { local_irq_enable(); /* loop is done by the caller */ @@ -122,6 +129,21 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); + disable_local_APIC(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + static void do_nothing(void *unused) { } @@ -154,24 +176,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __mwait(ax, cx); } + trace_power_end(&it); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { + struct power_trace it; if (!need_resched()) { + trace_power_start(&it, POWER_CSTATE, 1); __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(&it); } else local_irq_enable(); } @@ -183,9 +212,13 @@ static void mwait_idle(void) */ static void poll_idle(void) { + struct power_trace it; + + trace_power_start(&it, POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); + trace_power_end(&it); } /* @@ -270,7 +303,7 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d4..3ba155d2488 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@ #include <linux/percpu.h> #include <linux/prctl.h> #include <linux/dmi.h> +#include <linux/ftrace.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -59,6 +60,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/smp.h> +#include <asm/ds.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -250,14 +252,8 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(current->thread.ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(current->thread.ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -339,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + + ds_copy_thread(p, current); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + return err; } @@ -419,48 +421,19 @@ int set_tsc_mode(unsigned int val) return 0; } -#ifdef CONFIG_X86_DS -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - unsigned long ds_prev = 0; - unsigned long ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, ds_next, 0); - } - return debugctl; -} -#else -static int update_debugctl(struct thread_struct *prev, - struct thread_struct *next, unsigned long debugctl) -{ - return debugctl; -} -#endif /* CONFIG_X86_DS */ - static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread; next = &next_p->thread; - debugctl = update_debugctl(prev, next, prev->debugctlmsr); - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -482,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ - - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -548,7 +512,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120fb1b..416fb9282f4 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@ #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> +#include <linux/ftrace.h> #include <asm/pgtable.h> #include <asm/system.h> @@ -52,6 +53,7 @@ #include <asm/ia32.h> #include <asm/idle.h> #include <asm/syscalls.h> +#include <asm/ds.h> asmlinkage extern void ret_from_fork(void); @@ -235,14 +237,8 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } -#ifdef CONFIG_X86_DS - /* Free any DS contexts that have not been properly released. */ - if (unlikely(t->ds_ctx)) { - /* we clear debugctl to make sure DS is not used. */ - update_debugctlmsr(0); - ds_free(t->ds_ctx); - } -#endif /* CONFIG_X86_DS */ + + ds_exit_thread(current); } void flush_thread(void) @@ -372,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (err) goto out; } + + ds_copy_thread(p, me); + + clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); + p->thread.debugctlmsr = 0; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -470,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct tss_struct *tss) { struct thread_struct *prev, *next; - unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS - { - unsigned long ds_prev = 0, ds_next = 0; - - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* - * We clear debugctl to make sure DS - * is not in use when we change it: - */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); - } - } -#endif /* CONFIG_X86_DS */ - - if (next->debugctlmsr != debugctl) + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { @@ -533,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - -#ifdef CONFIG_X86_PTRACE_BTS - if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); - - if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) - ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif /* CONFIG_X86_PTRACE_BTS */ } /* @@ -551,8 +524,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. */ -struct task_struct * +__notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a6d8c12e10..0a5df5f82fb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -581,158 +581,91 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS -/* - * The configuration for a particular BTS hardware implementation. - */ -struct bts_configuration { - /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ - unsigned char sizeof_bts; - /* the size of a field in the BTS record in bytes */ - unsigned char sizeof_field; - /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ - unsigned long debugctl_mask; -}; -static struct bts_configuration bts_cfg; - -#define BTS_MAX_RECORD_SIZE (8 * 3) - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. - */ - -enum bts_field { - bts_from = 0, - bts_to, - bts_flags, - - bts_escape = (unsigned long)-1, - bts_qual = bts_to, - bts_jiffies = bts_flags -}; - -static inline unsigned long bts_get(const char *base, enum bts_field field) -{ - base += (bts_cfg.sizeof_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, enum bts_field field, unsigned long val) -{ - base += (bts_cfg.sizeof_field * field);; - (*(unsigned long *)base) = val; -} - -/* - * Translate a BTS record from the raw format into the bts_struct format - * - * out (out): bts_struct interpretation - * raw: raw BTS record - */ -static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) -{ - memset(out, 0, sizeof(*out)); - if (bts_get(raw, bts_from) == bts_escape) { - out->qualifier = bts_get(raw, bts_qual); - out->variant.jiffies = bts_get(raw, bts_jiffies); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = bts_get(raw, bts_from); - out->variant.lbr.to_ip = bts_get(raw, bts_to); - } -} - static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { - struct bts_struct ret; - const void *bts_record; - size_t bts_index, bts_end; + const struct bts_trace *trace; + struct bts_struct bts; + const unsigned char *at; int error; - error = ds_get_bts_end(child, &bts_end); - if (error < 0) - return error; - - if (bts_end <= index) - return -EINVAL; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - error = ds_get_bts_index(child, &bts_index); - if (error < 0) - return error; + at = trace->ds.top - ((index + 1) * trace->ds.size); + if ((void *)at < trace->ds.begin) + at += (trace->ds.n * trace->ds.size); - /* translate the ptrace bts index into the ds bts index */ - bts_index += bts_end - (index + 1); - if (bts_end <= bts_index) - bts_index -= bts_end; + if (!trace->read) + return -EOPNOTSUPP; - error = ds_access_bts(child, bts_index, &bts_record); + error = trace->read(child->bts, at, &bts); if (error < 0) return error; - ptrace_bts_translate_record(&ret, bts_record); - - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; - return sizeof(ret); + return sizeof(bts); } static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - struct bts_struct ret; - const unsigned char *raw; - size_t end, i; - int error; + const struct bts_trace *trace; + const unsigned char *at; + int error, drained = 0; - error = ds_get_bts_index(child, &end); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - if (size < (end * sizeof(struct bts_struct))) + if (!trace->read) + return -EOPNOTSUPP; + + if (size < (trace->ds.top - trace->ds.begin)) return -EIO; - error = ds_access_bts(child, 0, (const void **)&raw); - if (error < 0) - return error; + for (at = trace->ds.begin; (void *)at < trace->ds.top; + out++, drained++, at += trace->ds.size) { + struct bts_struct bts; + int error; - for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { - ptrace_bts_translate_record(&ret, raw); + error = trace->read(child->bts, at, &bts); + if (error < 0) + return error; - if (copy_to_user(out, &ret, sizeof(ret))) + if (copy_to_user(out, &bts, sizeof(bts))) return -EFAULT; } - error = ds_clear_bts(child); + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); + + error = ds_reset_bts(child->bts); if (error < 0) return error; - return end; + return drained; } -static void ptrace_bts_ovfl(struct task_struct *child) +static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) { - send_sig(child->thread.bts_ovfl_signal, child, 0); + child->bts_buffer = alloc_locked_buffer(size); + if (!child->bts_buffer) + return -ENOMEM; + + child->bts_size = size; + + return 0; +} + +static void ptrace_bts_free_buffer(struct task_struct *child) +{ + free_locked_buffer(child->bts_buffer, child->bts_size); + child->bts_buffer = NULL; + child->bts_size = 0; } static int ptrace_bts_config(struct task_struct *child, @@ -740,114 +673,86 @@ static int ptrace_bts_config(struct task_struct *child, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int error = 0; - - error = -EOPNOTSUPP; - if (!bts_cfg.sizeof_bts) - goto errout; + unsigned int flags = 0; - error = -EIO; if (cfg_size < sizeof(cfg)) - goto errout; + return -EIO; - error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - goto errout; + return -EFAULT; - error = -EINVAL; - if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && - !(cfg.flags & PTRACE_BTS_O_ALLOC)) - goto errout; + if (child->bts) { + ds_release_bts(child->bts); + child->bts = NULL; + } - if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = NULL; - unsigned int sig = 0; + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + return -EINVAL; - /* we ignore the error in case we were not tracing child */ - (void)ds_release_bts(child); + return -EOPNOTSUPP; - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - goto errout; + child->thread.bts_ovfl_signal = cfg.signal; + } - sig = cfg.signal; - ovfl = ptrace_bts_ovfl; - } + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && + (cfg.size != child->bts_size)) { + int error; - error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); - if (error < 0) - goto errout; + ptrace_bts_free_buffer(child); - child->thread.bts_ovfl_signal = sig; + error = ptrace_bts_allocate_buffer(child, cfg.size); + if (error < 0) + return error; } - error = -EINVAL; - if (!child->thread.ds_ctx && cfg.flags) - goto errout; - if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= bts_cfg.debugctl_mask; - else - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + flags |= BTS_USER; if (cfg.flags & PTRACE_BTS_O_SCHED) - set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - else - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + flags |= BTS_TIMESTAMPS; - error = sizeof(cfg); + child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, + /* ovfl = */ NULL, /* th = */ (size_t)-1, + flags); + if (IS_ERR(child->bts)) { + int error = PTR_ERR(child->bts); -out: - if (child->thread.debugctlmsr) - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + ptrace_bts_free_buffer(child); + child->bts = NULL; - return error; + return error; + } -errout: - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - goto out; + return sizeof(cfg); } static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + const struct bts_trace *trace; struct ptrace_bts_config cfg; - size_t end; - const void *base, *max; - int error; if (cfg_size < sizeof(cfg)) return -EIO; - error = ds_get_bts_end(child, &end); - if (error < 0) - return error; - - error = ds_access_bts(child, /* index = */ 0, &base); - if (error < 0) - return error; - - error = ds_access_bts(child, /* index = */ end, &max); - if (error < 0) - return error; + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; memset(&cfg, 0, sizeof(cfg)); - cfg.size = (max - base); + cfg.size = trace->ds.end - trace->ds.begin; cfg.signal = child->thread.bts_ovfl_signal; cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & bts_cfg.debugctl_mask) + if (trace->ds.flags & BTS_USER) cfg.flags |= PTRACE_BTS_O_TRACE; - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + if (trace->ds.flags & BTS_TIMESTAMPS) cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) @@ -856,110 +761,77 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } -static int ptrace_bts_write_record(struct task_struct *child, - const struct bts_struct *in) +static int ptrace_bts_clear(struct task_struct *child) { - unsigned char bts_record[BTS_MAX_RECORD_SIZE]; + const struct bts_trace *trace; - BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - memset(bts_record, 0, bts_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; + memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - case BTS_BRANCH: - bts_set(bts_record, bts_from, in->variant.lbr.from_ip); - bts_set(bts_record, bts_to, in->variant.lbr.to_ip); - break; + return ds_reset_bts(child->bts); +} - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - bts_set(bts_record, bts_from, bts_escape); - bts_set(bts_record, bts_qual, in->qualifier); - bts_set(bts_record, bts_jiffies, in->variant.jiffies); - break; +static int ptrace_bts_size(struct task_struct *child) +{ + const struct bts_trace *trace; - default: - return -EINVAL; - } + trace = ds_read_bts(child->bts); + if (!trace) + return -EPERM; - /* The writing task will be the switched-to task on a context - * switch. It needs to write into the switched-from task's BTS - * buffer. */ - return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); + return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -void ptrace_bts_take_timestamp(struct task_struct *tsk, - enum bts_qualifier qualifier) +static void ptrace_bts_fork(struct task_struct *tsk) { - struct bts_struct rec = { - .qualifier = qualifier, - .variant.jiffies = jiffies_64 - }; - - ptrace_bts_write_record(tsk, &rec); + tsk->bts = NULL; + tsk->bts_buffer = NULL; + tsk->bts_size = 0; + tsk->thread.bts_ovfl_signal = 0; } -static const struct bts_configuration bts_cfg_netburst = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<2)|(1<<3)|(1<<5) -}; +static void ptrace_bts_untrace(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; + + /* We cannot update total_vm and locked_vm since + child's mm is already gone. But we can reclaim the + memory. */ + kfree(child->bts_buffer); + child->bts_buffer = NULL; + child->bts_size = 0; + } +} -static const struct bts_configuration bts_cfg_pentium_m = { - .sizeof_bts = sizeof(long) * 3, - .sizeof_field = sizeof(long), - .debugctl_mask = (1<<6)|(1<<7) -}; +static void ptrace_bts_detach(struct task_struct *child) +{ + if (unlikely(child->bts)) { + ds_release_bts(child->bts); + child->bts = NULL; -static const struct bts_configuration bts_cfg_core2 = { - .sizeof_bts = 8 * 3, - .sizeof_field = 8, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) -}; + ptrace_bts_free_buffer(child); + } +} +#else +static inline void ptrace_bts_fork(struct task_struct *tsk) {} +static inline void ptrace_bts_detach(struct task_struct *child) {} +static inline void ptrace_bts_untrace(struct task_struct *child) {} +#endif /* CONFIG_X86_PTRACE_BTS */ -static inline void bts_configure(const struct bts_configuration *cfg) +void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) { - bts_cfg = *cfg; + ptrace_bts_fork(child); } -void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) +void x86_ptrace_untrace(struct task_struct *child) { - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0xD: - case 0xE: /* Pentium M */ - bts_configure(&bts_cfg_pentium_m); - break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ - bts_configure(&bts_cfg_core2); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - case 0xF: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - bts_configure(&bts_cfg_netburst); - break; - default: - /* sorry, don't know about them */ - break; - } - break; - default: - /* sorry, don't know about them */ - break; - } + ptrace_bts_untrace(child); } -#endif /* CONFIG_X86_PTRACE_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -972,15 +844,7 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif -#ifdef CONFIG_X86_PTRACE_BTS - (void)ds_release_bts(child); - - child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); -#endif /* CONFIG_X86_PTRACE_BTS */ + ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -1112,7 +976,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ NULL); + ret = ptrace_bts_size(child); break; case PTRACE_BTS_GET: @@ -1121,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ds_clear_bts(child); + ret = ptrace_bts_clear(child); break; case PTRACE_BTS_DRAIN: @@ -1384,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: +#ifdef CONFIG_X86_PTRACE_BTS + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: +#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 67465ed8931..309949e9e1c 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, ich_force_enable_hpet); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index cc5a2545dd4..2b46eb41643 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -12,6 +12,8 @@ #include <asm/proto.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> +#include <asm/pci_x86.h> +#include <asm/virtext.h> #ifdef CONFIG_X86_32 # include <linux/dmi.h> @@ -21,6 +23,8 @@ # include <asm/iommu.h> #endif +#include <mach_ipi.h> + /* * Power off function, if any */ @@ -36,7 +40,16 @@ int reboot_force; static int reboot_cpu = -1; #endif -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] +/* This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] warm Don't set the cold reboot flag cold Set the cold reboot flag bios Reboot by jumping through the BIOS (only for X86_32) @@ -45,6 +58,7 @@ static int reboot_cpu = -1; kbd Use the keyboard controller. cold reset (default) acpi Use the RESET_REG in the FADT efi Use efi reset_system runtime service + pci Use the so-called "PCI reset register", CF9 force Avoid anything that could hang. */ static int __init reboot_setup(char *str) @@ -79,6 +93,7 @@ static int __init reboot_setup(char *str) case 'k': case 't': case 'e': + case 'p': reboot_type = *str; break; @@ -360,6 +375,48 @@ static inline void kb_wait(void) } } +static void vmxoff_nmi(int cpu, struct die_args *args) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization + */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. + * + * We can't take any locks and we may be on an inconsistent + * state, so we use NMIs as IPIs to tell the other CPUs to disable + * VMX and halt. + * + * For safety, we will avoid running the nmi_shootdown_cpus() + * stuff unnecessarily, but we don't have a way to check + * if other CPUs have VMX enabled. So we will call it only if the + * CPU we are running on has VMX enabled. + * + * We will miss cases where VMX is not enabled on all CPUs. This + * shouldn't do much harm because KVM always enable VMX on all + * CPUs anyway. But we can miss it on the small window where KVM + * is still enabling VMX. + */ + if (cpu_has_vmx() && cpu_vmx_enabled()) { + /* Disable VMX on this CPU. + */ + cpu_vmxoff(); + + /* Halt and disable VMX on the other CPUs */ + nmi_shootdown_cpus(vmxoff_nmi); + + } +} + + void __attribute__((weak)) mach_reboot_fixups(void) { } @@ -368,6 +425,9 @@ static void native_machine_emergency_restart(void) { int i; + if (reboot_emergency) + emergency_vmx_disable_all(); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -404,12 +464,27 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; - case BOOT_EFI: if (efi_enabled) - efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + efi.reset_system(reboot_mode ? + EFI_RESET_WARM : + EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); + reboot_type = BOOT_KBD; + break; + + case BOOT_CF9: + port_cf9_safe = true; + /* fall through */ + case BOOT_CF9_COND: + if (port_cf9_safe) { + u8 cf9 = inb(0xcf9) & ~6; + outb(cf9|2, 0xcf9); /* Request hard reset */ + udelay(50); + outb(cf9|6, 0xcf9); /* Actually do the reset */ + udelay(50); + } reboot_type = BOOT_KBD; break; } @@ -426,7 +501,7 @@ void native_machine_shutdown(void) #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ - if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && + if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; #endif @@ -436,7 +511,7 @@ void native_machine_shutdown(void) reboot_cpu_id = smp_processor_id(); /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); + set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); /* O.K Now that I'm on the appropriate processor, * stop all of the others. @@ -459,17 +534,28 @@ void native_machine_shutdown(void) #endif } +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + static void native_machine_restart(char *__unused) { printk("machine restart\n"); if (!reboot_force) machine_shutdown(); - machine_emergency_restart(); + __machine_emergency_restart(0); } static void native_machine_halt(void) { + /* stop other cpus and apics */ + machine_shutdown(); + + /* stop this cpu */ + stop_this_cpu(NULL); } static void native_machine_power_off(void) @@ -504,7 +590,7 @@ void machine_shutdown(void) void machine_emergency_restart(void) { - machine_ops.emergency_restart(); + __machine_emergency_restart(1); } void machine_restart(char *cmd) @@ -523,3 +609,92 @@ void machine_crash_shutdown(struct pt_regs *regs) machine_ops.crash_shutdown(regs); } #endif + + +#if defined(CONFIG_SMP) + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + cpu = raw_smp_processor_id(); + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NOTIFY_STOP; + local_irq_disable(); + + shootdown_callback(cpu, (struct die_args *)data); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return 1; +} + +static void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(NMI_VECTOR); +} + +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + +/* Halt all other CPUs, calling the specified function on each of them + * + * This function can be used to halt all other CPUs on crash + * or emergency reboot time. The function passed as parameter + * will be called inside a NMI handler on all CPUs. + */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ +} +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} +#endif diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 6f50664b2ba..a160f311972 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -10,15 +10,12 @@ #include <asm/page.h> #include <asm/kexec.h> #include <asm/processor-flags.h> -#include <asm/pgtable.h> /* * Must be relocatable PIC code callable as a C function */ #define PTR(x) (x << 2) -#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define PAE_PGD_ATTR (_PAGE_PRESENT) /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE * ~ control_page + PAGE_SIZE are used as data storage and stack for @@ -39,7 +36,6 @@ #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) .text - .align PAGE_SIZE .globl relocate_kernel relocate_kernel: /* Save the CPU context, used for jumping back */ @@ -60,117 +56,6 @@ relocate_kernel: movl %cr4, %eax movl %eax, CR4(%edi) -#ifdef CONFIG_X86_PAE - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_0)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xc0000000, %eax - shrl $27, %eax - addl %edi, %eax - - movl PTR(PA_PMD_1)(%ebp), %edx - orl $PAE_PGD_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PMD_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x3fe00000, %eax - shrl $18, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x001ff000, %eax - shrl $9, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#else - /* map the control page at its virtual address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_0)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_0)(%ebp), %edi - movl PTR(VA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - /* identity map the control page at its physical address */ - - movl PTR(VA_PGD)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0xffc00000, %eax - shrl $20, %eax - addl %edi, %eax - - movl PTR(PA_PTE_1)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) - - movl PTR(VA_PTE_1)(%ebp), %edi - movl PTR(PA_CONTROL_PAGE)(%ebp), %eax - andl $0x003ff000, %eax - shrl $10, %eax - addl %edi, %eax - - movl PTR(PA_CONTROL_PAGE)(%ebp), %edx - orl $PAGE_ATTR, %edx - movl %edx, (%eax) -#endif - -relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 20+4(%esp), %ebx /* page_list */ movl 20+8(%esp), %ebp /* list of pages */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6c..ae0d8042cf6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -93,11 +93,13 @@ #include <asm/desc.h> #include <asm/dma.h> #include <asm/iommu.h> +#include <asm/gart.h> #include <asm/mmu_context.h> #include <asm/proto.h> #include <mach_apic.h> #include <asm/paravirt.h> +#include <asm/hypervisor.h> #include <asm/percpu.h> #include <asm/topology.h> @@ -448,6 +450,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. */ +static unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ @@ -583,161 +586,24 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - -/* - * Some BIOSes seem to corrupt the low 64k of memory during events - * like suspend/resume and unplugging an HDMI cable. Reserve all - * remaining free memory in that area and fill it with a distinct - * pattern. - */ -#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -#define MAX_SCAN_AREAS 8 - -static int __read_mostly memory_corruption_check = -1; - -static unsigned __read_mostly corruption_check_size = 64*1024; -static unsigned __read_mostly corruption_check_period = 60; /* seconds */ - -static struct e820entry scan_areas[MAX_SCAN_AREAS]; -static int num_scan_areas; - - -static int set_corruption_check(char *arg) +static int __init default_update_genapic(void) { - char *end; - - memory_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check", set_corruption_check); - -static int set_corruption_check_period(char *arg) -{ - char *end; - - corruption_check_period = simple_strtoul(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_period", set_corruption_check_period); - -static int set_corruption_check_size(char *arg) -{ - char *end; - unsigned size; - - size = memparse(arg, &end); - - if (*end == '\0') - corruption_check_size = size; - - return (size == corruption_check_size) ? 0 : -EINVAL; -} -early_param("memory_corruption_check_size", set_corruption_check_size); - - -static void __init setup_bios_corruption_check(void) -{ - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - - if (memory_corruption_check == -1) { - memory_corruption_check = -#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK - 1 -#else - 0 +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) + genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif #endif - ; - } - - if (corruption_check_size == 0) - memory_corruption_check = 0; - - if (!memory_corruption_check) - return; - - corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - - while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = find_e820_area_size(addr, &size, PAGE_SIZE); - - if (addr == 0) - break; - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - if (size == 0) - break; - - e820_update_range(addr, size, E820_RAM, E820_RESERVED); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; - - /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); - - addr += size; - } - - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); - update_e820(); -} - -static struct timer_list periodic_check_timer; - -void check_for_bios_corruption(void) -{ - int i; - int corruption = 0; - - if (!memory_corruption_check) - return; - - for(i = 0; i < num_scan_areas; i++) { - unsigned long *addr = __va(scan_areas[i].addr); - unsigned long size = scan_areas[i].size; - - for(; size; addr++, size -= sizeof(unsigned long)) { - if (!*addr) - continue; - printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", - addr, __pa(addr), *addr); - corruption = 1; - *addr = 0; - } - } - - WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); -} - -static void periodic_check_for_corruption(unsigned long data) -{ - check_for_bios_corruption(); - mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); + return 0; } -void start_periodic_check_for_corruption(void) -{ - if (!memory_corruption_check || corruption_check_period == 0) - return; - - printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", - corruption_check_period); +static struct x86_quirks default_x86_quirks __initdata = { + .update_genapic = default_update_genapic, +}; - init_timer(&periodic_check_timer); - periodic_check_timer.function = &periodic_check_for_corruption; - periodic_check_for_corruption(0); -} -#endif +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +#ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE @@ -749,6 +615,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) return 0; } +#endif /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { @@ -794,6 +661,9 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + /* VMI may relocate the fixmap; do this before touching ioremap area */ + vmi_init(); + early_cpu_init(); early_ioremap_init(); @@ -880,13 +750,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif + /* Must be before kernel pagetables are setup */ + vmi_activate(); /* after early param, so could get panic from serial */ reserve_early_setup_data(); @@ -909,6 +774,12 @@ void __init setup_arch(char **cmdline_p) dmi_check_system(bad_bios_dmi_table); + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. + */ + init_hypervisor(&boot_cpu_data); + #ifdef CONFIG_X86_32 probe_roms(); #endif @@ -1082,7 +953,7 @@ void __init setup_arch(char **cmdline_p) ioapic_init_mappings(); /* need to wait for io_apic is mapped */ - nr_irqs = probe_nr_irqs(); + probe_nr_irqs_gsi(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ae0c0d3bb77..a4b619c3310 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -152,8 +152,11 @@ void __init setup_per_cpu_areas(void) old_size = PERCPU_ENOUGH_ROOM; align = max_t(unsigned long, PAGE_SIZE, align); size = roundup(old_size, align); - printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", - size); + + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + + pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -164,28 +167,21 @@ void __init setup_per_cpu_areas(void) if (!node_online(node) || !NODE_DATA(node)) { ptr = __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); - printk(KERN_INFO - "cpu %d has no node %d or node-local memory\n", + pr_info("cpu %d has no node %d or node-local memory\n", cpu, node); - if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", - cpu, __pa(ptr)); - } - else { + pr_debug("per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); + } else { ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, __pa(MAX_DMA_ADDRESS)); - if (ptr) - printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", - cpu, node, __pa(ptr)); + pr_debug("per cpu data for cpu%d on node%d at %016lx\n", + cpu, node, __pa(ptr)); } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } - printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", - NR_CPUS, nr_cpu_ids, nr_node_ids); - /* Setup percpu data maps */ setup_per_cpu_maps(); @@ -282,7 +278,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) else cpu_clear(cpu, *mask); - cpulist_scnprintf(buf, sizeof(buf), *mask); + cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf); } @@ -334,25 +330,25 @@ static const cpumask_t cpu_mask_none; /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ -const cpumask_t *_node_to_cpumask_ptr(int node) +const cpumask_t *cpumask_of_node(int node) { if (node_to_cpumask_map == NULL) { printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", + "cpumask_of_node(%d): no node_to_cpumask_map!\n", node); dump_stack(); return (const cpumask_t *)&cpu_online_map; } if (node >= nr_node_ids) { printk(KERN_WARNING - "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", node, nr_node_ids); dump_stack(); return &cpu_mask_none; } return &node_to_cpumask_map[node]; } -EXPORT_SYMBOL(_node_to_cpumask_ptr); +EXPORT_SYMBOL(cpumask_of_node); /* * Returns a bitmask of CPUs on Node 'node'. diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h deleted file mode 100644 index cc673aa55ce..00000000000 --- a/arch/x86/kernel/sigframe.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifdef CONFIG_X86_32 -struct sigframe { - char __user *pretcode; - int sig; - struct sigcontext sc; - /* - * fpstate is unused. fpstate is moved/allocated after - * retcode[] below. This movement allows to have the FP state and the - * future state extensions (xsave) stay together. - * And at the same time retaining the unused fpstate, prevents changing - * the offset of extramask[] in the sigframe and thus prevent any - * legacy application accessing/modifying it. - */ - struct _fpstate fpstate_unused; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; - /* fp state follows here */ -}; - -struct rt_sigframe { - char __user *pretcode; - int sig; - struct siginfo __user *pinfo; - void __user *puc; - struct siginfo info; - struct ucontext uc; - char retcode[8]; - /* fp state follows here */ -}; -#else -struct rt_sigframe { - char __user *pretcode; - struct ucontext uc; - struct siginfo info; - /* fp state follows here */ -}; - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs *regs); -#endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c index d6dd057d0f2..89bb7668041 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal.c @@ -1,36 +1,41 @@ /* * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen */ -#include <linux/list.h> -#include <linux/personality.h> -#include <linux/binfmts.h> -#include <linux/suspend.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> #include <linux/kernel.h> -#include <linux/ptrace.h> #include <linux/signal.h> -#include <linux/stddef.h> -#include <linux/unistd.h> #include <linux/errno.h> -#include <linux/sched.h> #include <linux/wait.h> +#include <linux/ptrace.h> #include <linux/tracehook.h> -#include <linux/elf.h> -#include <linux/smp.h> -#include <linux/mm.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/ucontext.h> -#include <asm/uaccess.h> #include <asm/i387.h> #include <asm/vdso.h> + +#ifdef CONFIG_X86_64 +#include <asm/proto.h> +#include <asm/ia32_unistd.h> +#include <asm/mce.h> +#endif /* CONFIG_X86_64 */ + #include <asm/syscall.h> #include <asm/syscalls.h> -#include "sigframe.h" +#include <asm/sigframe.h> #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -45,74 +50,6 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - mask &= _BLOCKABLE; - spin_lock_irq(¤t->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(¤t->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - - return -ERESTARTNOHAND; -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) - return -EFAULT; - - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - } - - return ret; -} - -asmlinkage int sys_sigaltstack(unsigned long bx) -{ - /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" - */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} - #define COPY(x) { \ err |= __get_user(regs->x, &sc->x); \ } @@ -123,7 +60,7 @@ asmlinkage int sys_sigaltstack(unsigned long bx) regs->seg = tmp; \ } -#define COPY_SEG_STRICT(seg) { \ +#define COPY_SEG_CPL3(seg) { \ unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ regs->seg = tmp | 3; \ @@ -135,9 +72,6 @@ asmlinkage int sys_sigaltstack(unsigned long bx) loadsegment(seg, tmp); \ } -/* - * Do a signal return; undo the signal stack. - */ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) @@ -149,14 +83,36 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; +#ifdef CONFIG_X86_32 GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); +#endif /* CONFIG_X86_32 */ + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); - COPY_SEG_STRICT(cs); - COPY_SEG_STRICT(ss); + +#ifdef CONFIG_X86_64 + COPY(r8); + COPY(r9); + COPY(r10); + COPY(r11); + COPY(r12); + COPY(r13); + COPY(r14); + COPY(r15); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_32 + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. */ + COPY_SEG_CPL3(cs); +#endif /* CONFIG_X86_32 */ err |= __get_user(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -169,102 +125,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -asmlinkage unsigned long sys_sigreturn(unsigned long __unused) -{ - struct sigframe __user *frame; - struct pt_regs *regs; - unsigned long ax; - sigset_t set; - - regs = (struct pt_regs *) &__unused; - frame = (struct sigframe __user *)(regs->sp - 8); - - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->sc, &ax)) - goto badframe; - return ax; - -badframe: - if (show_unhandled_signals && printk_ratelimit()) { - printk("%s%s[%d] bad frame in sigreturn frame:" - "%p ip:%lx sp:%lx oeax:%lx", - task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->ip, - regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, current); - - return 0; -} - -static long do_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; - - return ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); -} - -/* - * Set up a signal frame. - */ static int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { - int tmp, err = 0; + int err = 0; - err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); +#ifdef CONFIG_X86_32 + { + unsigned int tmp; + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + } + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); err |= __put_user(regs->es, (unsigned int __user *)&sc->es); err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); +#endif /* CONFIG_X86_32 */ + err |= __put_user(regs->di, &sc->di); err |= __put_user(regs->si, &sc->si); err |= __put_user(regs->bp, &sc->bp); @@ -273,19 +151,33 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, err |= __put_user(regs->dx, &sc->dx); err |= __put_user(regs->cx, &sc->cx); err |= __put_user(regs->ax, &sc->ax); +#ifdef CONFIG_X86_64 + err |= __put_user(regs->r8, &sc->r8); + err |= __put_user(regs->r9, &sc->r9); + err |= __put_user(regs->r10, &sc->r10); + err |= __put_user(regs->r11, &sc->r11); + err |= __put_user(regs->r12, &sc->r12); + err |= __put_user(regs->r13, &sc->r13); + err |= __put_user(regs->r14, &sc->r14); + err |= __put_user(regs->r15, &sc->r15); +#endif /* CONFIG_X86_64 */ + err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); err |= __put_user(regs->ip, &sc->ip); +#ifdef CONFIG_X86_32 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); err |= __put_user(regs->flags, &sc->flags); err |= __put_user(regs->sp, &sc->sp_at_signal); err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); +#else /* !CONFIG_X86_32 */ + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(0, &sc->gs); + err |= __put_user(0, &sc->fs); +#endif /* CONFIG_X86_32 */ - tmp = save_i387_xstate(fpstate); - if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); + err |= __put_user(fpstate, &sc->fpstate); /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); @@ -295,6 +187,32 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, } /* + * Set up a signal frame. + */ +#ifdef CONFIG_X86_32 +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; + +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; + +/* * Determine which stack to use.. */ static inline void __user * @@ -328,6 +246,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, if (used_math()) { sp = sp - sig_xstate_size; *fpstate = (struct _fpstate *) sp; + if (save_i387_xstate(*fpstate) < 0) + return (void __user *)-1L; } sp -= frame_size; @@ -383,9 +303,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); - err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -454,9 +372,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * reasons and because gdb uses it as a signature to notice * signal handler stack frames. */ - err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); - err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); + err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode); if (err) return -EFAULT; @@ -475,23 +391,293 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return 0; } +#else /* !CONFIG_X86_32 */ +/* + * Determine which stack to use.. + */ +static void __user * +get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size) +{ + /* Default to using normal stack - redzone*/ + sp -= 128; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; + } + + return (void __user *)round_down(sp - size, 64); +} + +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *fp = NULL; + int err = 0; + struct task_struct *me = current; + + if (used_math()) { + fp = get_stack(ka, regs->sp, sig_xstate_size); + frame = (void __user *)round_down( + (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; + + if (save_i387_xstate(fp) < 0) + return -EFAULT; + } else + frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8; + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ka->sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, info)) + return -EFAULT; + } + + /* Create the ucontext. */ + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->sp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + /* x86-64 should always use SA_RESTORER. */ + if (ka->sa.sa_flags & SA_RESTORER) { + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + } else { + /* could use a vstub here */ + return -EFAULT; + } + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->di = sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. */ + regs->cs = __USER_CS; + + return 0; +} +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_32 +/* + * Atomically swap in the new signal mask, and wait for a signal. + */ +asmlinkage int +sys_sigsuspend(int history0, int history1, old_sigset_t mask) +{ + mask &= _BLOCKABLE; + spin_lock_irq(¤t->sighand->siglock); + current->saved_sigmask = current->blocked; + siginitset(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + + return -ERESTARTNOHAND; +} + +asmlinkage int +sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_32 +asmlinkage int sys_sigaltstack(unsigned long bx) +{ + /* + * This is needed to make gcc realize it doesn't own the + * "struct pt_regs" + */ + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long +sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) +{ + return do_sigaltstack(uss, uoss, regs->sp); +} +#endif /* CONFIG_X86_32 */ + +/* + * Do a signal return; undo the signal stack. + */ +#ifdef CONFIG_X86_32 +asmlinkage unsigned long sys_sigreturn(unsigned long __unused) +{ + struct sigframe __user *frame; + struct pt_regs *regs; + unsigned long ax; + sigset_t set; + + regs = (struct pt_regs *) &__unused; + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 + && __copy_from_user(&set.sig[1], &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (restore_sigcontext(regs, &frame->sc, &ax)) + goto badframe; + return ax; + +badframe: + signal_fault(regs, frame, "sigreturn"); + + return 0; +} +#endif /* CONFIG_X86_32 */ + +static long do_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage int sys_rt_sigreturn(struct pt_regs regs) +{ + return do_rt_sigreturn(®s); +} +#else /* !CONFIG_X86_32 */ +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} +#endif /* CONFIG_X86_32 */ /* * OK, we're invoking a handler: */ static int signr_convert(int sig) { +#ifdef CONFIG_X86_32 struct thread_info *info = current_thread_info(); if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) return info->exec_domain->signal_invmap[sig]; +#endif /* CONFIG_X86_32 */ return sig; } +#ifdef CONFIG_X86_32 + #define is_ia32 1 #define ia32_setup_frame __setup_frame #define ia32_setup_rt_frame __setup_rt_frame +#else /* !CONFIG_X86_32 */ + +#ifdef CONFIG_IA32_EMULATION +#define is_ia32 test_thread_flag(TIF_IA32) +#else /* !CONFIG_IA32_EMULATION */ +#define is_ia32 0 +#endif /* CONFIG_IA32_EMULATION */ + +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); + +#endif /* CONFIG_X86_32 */ + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) @@ -592,7 +778,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return 0; } +#ifdef CONFIG_X86_32 #define NR_restart_syscall __NR_restart_syscall +#else /* !CONFIG_X86_32 */ +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall +#endif /* CONFIG_X86_32 */ + /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -704,8 +896,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO + printk("%s" "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c deleted file mode 100644 index a5c9627f4db..00000000000 --- a/arch/x86/kernel/signal_64.c +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-2002 x86-64 support by Andi Kleen - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/tracehook.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/personality.h> -#include <linux/compiler.h> -#include <linux/uaccess.h> - -#include <asm/processor.h> -#include <asm/ucontext.h> -#include <asm/i387.h> -#include <asm/proto.h> -#include <asm/ia32_unistd.h> -#include <asm/mce.h> -#include <asm/syscall.h> -#include <asm/syscalls.h> -#include "sigframe.h" - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ - X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ - X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ - X86_EFLAGS_CF) - -#ifdef CONFIG_X86_32 -# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) -#else -# define FIX_EFLAGS __FIX_EFLAGS -#endif - -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->sp); -} - -#define COPY(x) { \ - err |= __get_user(regs->x, &sc->x); \ -} - -#define COPY_SEG_STRICT(seg) { \ - unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp | 3; \ -} - -/* - * Do a signal return; undo the signal stack. - */ -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) -{ - void __user *buf; - unsigned int tmpflags; - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); - - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_STRICT(cs); - - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - - err |= __get_user(*pax, &sc->ax); - return err; -} - -static long do_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - unsigned long ax; - sigset_t set; - - frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); - if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) - goto badframe; - - return ax; - -badframe: - signal_fault(regs, frame, "rt_sigreturn"); - return 0; -} - -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - return do_rt_sigreturn(regs); -} - -/* - * Set up a signal frame. - */ - -static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, - unsigned long mask, struct task_struct *me) -{ - int err = 0; - - err |= __put_user(regs->cs, &sc->cs); - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); - - err |= __put_user(regs->di, &sc->di); - err |= __put_user(regs->si, &sc->si); - err |= __put_user(regs->bp, &sc->bp); - err |= __put_user(regs->sp, &sc->sp); - err |= __put_user(regs->bx, &sc->bx); - err |= __put_user(regs->dx, &sc->dx); - err |= __put_user(regs->cx, &sc->cx); - err |= __put_user(regs->ax, &sc->ax); - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); - err |= __put_user(me->thread.trap_no, &sc->trapno); - err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->ip, &sc->ip); - err |= __put_user(regs->flags, &sc->flags); - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(me->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. - */ - -static void __user * -get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) -{ - unsigned long sp; - - /* Default to using normal stack - redzone*/ - sp = regs->sp - 128; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) - sp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)round_down(sp - size, 64); -} - -static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - void __user *fp = NULL; - int err = 0; - struct task_struct *me = current; - - if (used_math()) { - fp = get_stack(ka, regs, sig_xstate_size); - frame = (void __user *)round_down( - (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - - if (save_i387_xstate(fp) < 0) - return -EFAULT; - } else - frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - return -EFAULT; - - if (ka->sa.sa_flags & SA_SIGINFO) { - if (copy_siginfo_to_user(&frame->info, info)) - return -EFAULT; - } - - /* Create the ucontext. */ - if (cpu_has_xsave) - err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); - else - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->sp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); - err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } else - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - return -EFAULT; - } - - if (err) - return -EFAULT; - - /* Set up registers for signal handler */ - regs->di = sig; - /* In case the signal handler was declared without prototypes */ - regs->ax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. */ - regs->si = (unsigned long)&frame->info; - regs->dx = (unsigned long)&frame->uc; - regs->ip = (unsigned long) ka->sa.sa_handler; - - regs->sp = (unsigned long)frame; - - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ - regs->cs = __USER_CS; - - return 0; -} - -/* - * OK, we're invoking a handler - */ -static int signr_convert(int sig) -{ - return sig; -} - -#ifdef CONFIG_IA32_EMULATION -#define is_ia32 test_thread_flag(TIF_IA32) -#else -#define is_ia32 0 -#endif - -static int -setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) -{ - int usig = signr_convert(sig); - int ret; - - /* Set up the stack frame */ - if (is_ia32) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(usig, ka, info, set, regs); - else - ret = ia32_setup_frame(usig, ka, set, regs); - } else - ret = __setup_rt_frame(sig, ka, info, set, regs); - - if (ret) { - force_sigsegv(sig, current); - return -EFAULT; - } - - return ret; -} - -static int -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) -{ - int ret; - - /* Are we from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { - /* If so, check system call restarting.. */ - switch (syscall_get_error(current, regs)) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->ax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->ax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->ax = regs->orig_ax; - regs->ip -= 2; - break; - } - } - - /* - * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF - * flag so that register information in the sigcontext is correct. - */ - if (unlikely(regs->flags & X86_EFLAGS_TF) && - likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->flags &= ~X86_EFLAGS_TF; - - ret = setup_rt_frame(sig, ka, info, oldset, regs); - - if (ret) - return ret; - -#ifdef CONFIG_X86_64 - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); -#endif - - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; - - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - - return 0; -} - -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ -static void do_signal(struct pt_regs *regs) -{ - struct k_sigaction ka; - siginfo_t info; - int signr; - sigset_t *oldset; - - /* - * We want the common case to go fast, which is why we may in certain - * cases get here from kernel mode. Just return without doing anything - * if so. - * X86_32: vm86 regs switched out by assembly code before reaching - * here, so testing against kernel CS suffices. - */ - if (!user_mode(regs)) - return; - - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = ¤t->saved_sigmask; - else - oldset = ¤t->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - - /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } - return; - } - - /* Did we come from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { - /* Restart the system call - no handlers present */ - switch (syscall_get_error(current, regs)) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - regs->ax = regs->orig_ax; - regs->ip -= 2; - break; - - case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; - regs->ip -= 2; - break; - } - } - - /* - * If there's no signal to deliver, we just put the saved sigmask - * back. - */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); - } -} - -/* - * notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ -#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) - /* notify userspace of pending MCEs */ - if (thread_info_flags & _TIF_MCE_NOTIFY) - mce_notify_user(); -#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - -#ifdef CONFIG_X86_32 - clear_thread_flag(TIF_IRET); -#endif /* CONFIG_X86_32 */ -} - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; - - if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm, me->pid, where, frame, - regs->ip, regs->sp, regs->orig_ax); - print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); - } - - force_sig(SIGSEGV, me); -} diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8..beea2649a24 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -118,41 +118,28 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); + send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } void native_send_call_func_single_ipi(int cpu) { - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); + send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); } -void native_send_call_func_ipi(cpumask_t mask) +void native_send_call_func_ipi(const struct cpumask *mask) { cpumask_t allbutself; allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - if (cpus_equal(mask, allbutself) && + if (cpus_equal(*mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else send_IPI_mask(mask, CALL_FUNCTION_VECTOR); } -static void stop_this_cpu(void *dummy) -{ - local_irq_disable(); - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - disable_local_APIC(); - if (hlt_works(smp_processor_id())) - for (;;) halt(); - for (;;); -} - /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -178,11 +165,7 @@ static void native_smp_send_stop(void) void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_resched_count++; -#else - add_pda(irq_resched_count, 1); -#endif + inc_irq_stat(irq_resched_count); } void smp_call_function_interrupt(struct pt_regs *regs) @@ -190,11 +173,7 @@ void smp_call_function_interrupt(struct pt_regs *regs) ack_APIC_irq(); irq_enter(); generic_smp_call_function_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } @@ -203,11 +182,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) ack_APIC_irq(); irq_enter(); generic_smp_call_function_single_interrupt(); -#ifdef CONFIG_X86_32 - __get_cpu_var(irq_stat).irq_call_count++; -#else - add_pda(irq_call_count, 1); -#endif + inc_irq_stat(irq_call_count); irq_exit(); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b109339731..6bd4d9b7387 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include <asm/mtrr.h> #include <asm/vmi.h> #include <asm/genapic.h> +#include <asm/setup.h> #include <linux/mc146818rtc.h> #include <mach_apic.h> @@ -101,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; -/* bitmap of online cpus */ -cpumask_t cpu_online_map __read_mostly; -EXPORT_SYMBOL(cpu_online_map); - cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -cpumask_t cpu_possible_map; -EXPORT_SYMBOL(cpu_possible_map); /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); @@ -287,16 +282,14 @@ static int __cpuinitdata unsafe_smp; /* * Activate a secondary processor. */ -static void __cpuinit start_secondary(void *unused) +notrace static void __cpuinit start_secondary(void *unused) { /* * Don't put *anything* before cpu_init(), SMP booting is too * fragile that we want to limit the things done here to the * most necessary things. */ -#ifdef CONFIG_VMI vmi_bringup(); -#endif cpu_init(); preempt_disable(); smp_callin(); @@ -503,7 +496,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) } /* maps the cpu to the sched domain representing multi-core */ -cpumask_t cpu_coregroup_map(int cpu) +const struct cpumask *cpu_coregroup_mask(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); /* @@ -511,9 +504,14 @@ cpumask_t cpu_coregroup_map(int cpu) * And for power savings, we return cpu_core_map */ if (sched_mc_power_savings || sched_smt_power_savings) - return per_cpu(cpu_core_map, cpu); + return &per_cpu(cpu_core_map, cpu); else - return c->llc_shared_map; + return &c->llc_shared_map; +} + +cpumask_t cpu_coregroup_map(int cpu) +{ + return *cpu_coregroup_mask(cpu); } static void impress_friends(void) @@ -536,7 +534,7 @@ static void impress_friends(void) pr_debug("Before bogocount - setting activated=1.\n"); } -static inline void __inquire_remote_apic(int apicid) +void __inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; @@ -575,14 +573,13 @@ static inline void __inquire_remote_apic(int apicid) } } -#ifdef WAKE_SECONDARY_VIA_NMI /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __devinit -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -599,7 +596,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -614,11 +611,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_NMI */ -#ifdef WAKE_SECONDARY_VIA_INIT -static int __devinit -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; @@ -737,7 +732,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -#endif /* WAKE_SECONDARY_VIA_INIT */ struct create_idle { struct work_struct work; @@ -1086,8 +1080,10 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING "weird, boot CPU (#%d) not listed" - "by the BIOS.\n", hard_smp_processor_id()); + printk(KERN_WARNING + "weird, boot CPU (#%d) not listed by the BIOS.\n", + hard_smp_processor_id()); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); } @@ -1158,7 +1154,7 @@ static void __init smp_cpu_index_default(void) for_each_possible_cpu(i) { c = &cpu_data(i); /* mark all to hotplug */ - c->cpu_index = NR_CPUS; + c->cpu_index = nr_cpu_ids; } } @@ -1263,6 +1259,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } +static int __initdata setup_possible_cpus = -1; +static int __init _setup_possible_cpus(char *str) +{ + get_option(&str, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + + /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1275,7 +1280,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) * * Three ways to find out the number of additional hotplug CPUs: * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with additional_cpus=NUM + * - The user can overwrite it with possible_cpus=NUM * - Otherwise don't reserve additional CPUs. * We do this because additional CPUs waste a lot of memory. * -AK @@ -1288,9 +1293,19 @@ __init void prefill_possible_map(void) if (!num_processors) num_processors = 1; - possible = num_processors + disabled_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (setup_possible_cpus == -1) + possible = num_processors + disabled_cpus; + else + possible = setup_possible_cpus; + + total_cpus = max_t(int, possible, num_processors + disabled_cpus); + + if (possible > CONFIG_NR_CPUS) { + printk(KERN_WARNING + "%d Processors exceeds NR_CPUS limit of %d\n", + possible, CONFIG_NR_CPUS); + possible = CONFIG_NR_CPUS; + } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); @@ -1355,7 +1370,7 @@ void cpu_disable_common(void) lock_vector_lock(); remove_cpu_from_maps(cpu); unlock_vector_lock(); - fixup_irqs(cpu_online_map); + fixup_irqs(); } int native_cpu_disable(void) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c..10786af9554 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/stacktrace.h> #include <linux/module.h> +#include <linux/uaccess.h> #include <asm/stacktrace.h> static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + __save_stack_trace_user(trace); + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 77b400f06ea..65309e4cb1c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc); irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ - per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; + inc_irq_stat(irq0_irqs); #ifdef CONFIG_X86_IO_APIC if (timer_ack) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d650c21..891e7a7c433 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { - add_pda(irq0_irqs, 1); + inc_irq_stat(irq0_irqs); global_clock_event->event_handler(global_clock_event); @@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void) break; no_ctr_free = (i == 4); if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " + "cpu_khz value may be incorrect.\n"); i = 3; rdmsrl(MSR_K7_EVNTSEL3, evntsel3); wrmsrl(MSR_K7_EVNTSEL3, 0); diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index f4049f3513b..ce505464224 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock); */ void leave_mm(int cpu) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); + BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK); + cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { + if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) { + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) { if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -119,7 +118,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - __get_cpu_var(irq_stat).irq_tlb_count++; + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@ -164,7 +163,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR); while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ @@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) + if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 8f919ca6949..f8be6f1d2e4 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); - add_pda(irq_tlb_count, 1); + inc_irq_stat(irq_tlb_count); } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender); while (!cpus_empty(f->flush_cpumask)) cpu_relax(); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 04431f34fd1..f885023167e 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -566,14 +566,10 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - if (!proc_mkdir("sgi_uv", NULL)) - return -EINVAL; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); - remove_proc_entry("sgi_uv", NULL); return -EINVAL; } proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; @@ -586,7 +582,6 @@ static int __init uv_ptc_init(void) static struct bau_control * __init uv_table_bases_init(int blade, int node) { int i; - int *ip; struct bau_msg_status *msp; struct bau_control *bau_tabp; @@ -603,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) bau_cpubits_clear(&msp->seen_by, (int) uv_blade_nr_possible_cpus(blade)); - bau_tabp->watching = - kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node); - BUG_ON(!bau_tabp->watching); - - for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) - *ip = 0; - uv_bau_table_bases[blade] = bau_tabp; return bau_tabp; @@ -632,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu, bcp->bau_msg_head = bau_tablesp->va_queue_first; bcp->va_queue_first = bau_tablesp->va_queue_first; bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->watching = bau_tablesp->watching; bcp->msg_statuses = bau_tablesp->msg_statuses; bcp->descriptor_base = adp; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 1106fac6024..808031a5ba1 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,10 +1,26 @@ #include <linux/io.h> #include <asm/trampoline.h> +#include <asm/e820.h> /* ready for x86_64 and x86 */ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +void __init reserve_trampoline_memory(void) +{ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); +#endif + /* Has to be in very low memory so we can execute real-mode AP code. */ + reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, + "TRAMPOLINE"); +} + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller @@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); */ unsigned long setup_trampoline(void) { - memcpy(trampoline_base, trampoline_data, - trampoline_end - trampoline_data); + memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 04d242ab016..ce6650eb64e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -72,9 +72,6 @@ #include "cpu/mcheck/mce.h" -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -89,6 +86,9 @@ gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + static int ignore_nmis; static inline void conditional_sti(struct pt_regs *regs) @@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_no = 8; - /* This is always a kernel trap and never fixable (and thus must - never return). */ + /* + * This is always a kernel trap and never fixable (and thus must + * never return). + */ for (;;) die(str, regs, error_code); } @@ -481,11 +483,7 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif + inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); @@ -524,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) } #ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ +/* + * Help handler running on IST stack to switch back to user stack + * for scheduling or signal handling. The actual stack switch is done in + * entry.S + */ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; @@ -536,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ + /* + * Exception from kernel and interrupts are enabled. Move to + * kernel process stack. + */ else if (eregs->flags & X86_EFLAGS_IF) regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) @@ -664,7 +666,7 @@ void math_error(void __user *ip) { struct task_struct *task; siginfo_t info; - unsigned short cwd, swd; + unsigned short cwd, swd, err; /* * Save the info for the exception handler and clear the error. @@ -675,7 +677,6 @@ void math_error(void __user *ip) task->thread.error_code = 0; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_code = __SI_FAULT; info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked @@ -689,34 +690,30 @@ void math_error(void __user *ip) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ + + err = swd & ~cwd; + + if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ + } else if (err & 0x004) { /* Divide by Zero */ info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ + } else if (err & 0x008) { /* Overflow */ info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; - break; + } else { + /* + * If we're using IRQ 13, or supposedly even some trap 16 + * implementations, it's possible we get a spurious trap... + */ + return; /* Spurious trap, no error */ } force_sig_info(SIGFPE, &info, task); } @@ -949,9 +946,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init trap_init(void) { -#ifdef CONFIG_X86_32 int i; -#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@ -1008,11 +1003,15 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); +#endif /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) set_bit(i, used_vectors); +#ifdef CONFIG_X86_64 + set_bit(IA32_SYSCALL_VECTOR, used_vectors); +#else set_bit(SYSCALL_VECTOR, used_vectors); #endif /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 424093b157d..599e5816863 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -15,6 +15,7 @@ #include <asm/vgtod.h> #include <asm/time.h> #include <asm/delay.h> +#include <asm/hypervisor.h> unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -31,6 +32,7 @@ static int tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int tsc_disabled = -1; +static int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -98,6 +100,15 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int __init tsc_setup(char *str) +{ + if (!strcmp(str, "reliable")) + tsc_clocksource_reliable = 1; + return 1; +} + +__setup("tsc=", tsc_setup); + #define MAX_RETRIES 5 #define SMI_TRESHOLD 50000 @@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms, fast_calibrate, tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; + tsc_khz = get_hypervisor_tsc_freq(); + if (tsc_khz) { + printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); + return tsc_khz; + } + local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { {} }; -/* - * Geode_LX - the OLPC CPU has a possibly a very reliable TSC - */ +static void __init check_system_tsc_reliable(void) +{ #ifdef CONFIG_MGEODE_LX -/* RTSC counts during suspend */ + /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 - -static void __init check_geode_tsc_reliable(void) -{ unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ if (res_low & RTSC_SUSP) - clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; -} -#else -static inline void check_geode_tsc_reliable(void) { } + tsc_clocksource_reliable = 1; #endif + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + tsc_clocksource_reliable = 1; +} /* * Make an educated guess if the TSC is trustworthy and synchronized @@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void) { clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); + if (tsc_clocksource_reliable) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) { clocksource_tsc.rating = 0; @@ -843,7 +859,7 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); - check_geode_tsc_reliable(); + check_system_tsc_reliable(); init_tsc_clocksource(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 1c0dfbca87c..bf36328f6ef 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -112,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + printk(KERN_INFO + "Skipping synchronization checks as TSC is reliable.\n"); + return; + } + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", smp_processor_id(), cpu); @@ -165,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc()) + if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) return; /* diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 8b6c393ab9f..23206ba1687 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -266,109 +266,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_DEBUG_PAGE_TYPE - -#ifdef CONFIG_X86_PAE -#define MAX_BOOT_PTS (2048+4+1) -#else -#define MAX_BOOT_PTS (1024+1) -#endif - -/* - * During boot, mem_map is not yet available in paging_init, so stash - * all the boot page allocations here. - */ -static struct { - u32 pfn; - int type; -} boot_page_allocations[MAX_BOOT_PTS]; -static int num_boot_page_allocations; -static int boot_allocations_applied; - -void vmi_apply_boot_page_allocations(void) -{ - int i; - BUG_ON(!mem_map); - for (i = 0; i < num_boot_page_allocations; i++) { - struct page *page = pfn_to_page(boot_page_allocations[i].pfn); - page->type = boot_page_allocations[i].type; - page->type = boot_page_allocations[i].type & - ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - } - boot_allocations_applied = 1; -} - -static void record_page_type(u32 pfn, int type) -{ - BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); - boot_page_allocations[num_boot_page_allocations].pfn = pfn; - boot_page_allocations[num_boot_page_allocations].type = type; - num_boot_page_allocations++; -} - -static void check_zeroed_page(u32 pfn, int type, struct page *page) -{ - u32 *ptr; - int i; - int limit = PAGE_SIZE / sizeof(int); - - if (page_address(page)) - ptr = (u32 *)page_address(page); - else - ptr = (u32 *)__va(pfn << PAGE_SHIFT); - /* - * When cloning the root in non-PAE mode, only the userspace - * pdes need to be zeroed. - */ - if (type & VMI_PAGE_CLONE) - limit = KERNEL_PGD_BOUNDARY; - for (i = 0; i < limit; i++) - BUG_ON(ptr[i]); -} - -/* - * We stash the page type into struct page so we can verify the page - * types are used properly. - */ -static void vmi_set_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - don't track */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - if (type != VMI_PAGE_NORMAL) - BUG_ON(page->type); - else - BUG_ON(page->type == VMI_PAGE_NORMAL); - page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (type & VMI_PAGE_ZEROED) - check_zeroed_page(pfn, type, page); - } else { - record_page_type(pfn, type); - } -} - -static void vmi_check_page_type(u32 pfn, int type) -{ - /* PAE can have multiple roots per page - skip checks */ - if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) - return; - - type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); - if (boot_allocations_applied) { - struct page *page = pfn_to_page(pfn); - BUG_ON((page->type ^ type) & VMI_PAGE_PAE); - BUG_ON(type == VMI_PAGE_NORMAL && page->type); - BUG_ON((type & page->type) == 0); - } -} -#else -#define vmi_set_page_type(p,t) do { } while (0) -#define vmi_check_page_type(p,t) do { } while (0) -#endif - #ifdef CONFIG_HIGHPTE static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { - vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } @@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) * It is called only for swapper_pg_dir, which already has * data on it. */ - vmi_set_page_type(pfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { - vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); - vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); - vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } /* @@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn) static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_set_pte(pte_t *ptep, pte_t pte) { /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); } static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } @@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE const pte_t pte = { .pte = pmdval.pmd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; - vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); #endif vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); } @@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); } @@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ const pte_t pte = { .pte = pudval.pgd.pgd }; - vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { const pte_t pte = { .pte = 0 }; - vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } #endif @@ -960,8 +841,6 @@ static inline int __init activate_vmi(void) void __init vmi_init(void) { - unsigned long flags; - if (!vmi_rom) probe_vmi_rom(); else @@ -973,13 +852,21 @@ void __init vmi_init(void) reserve_top_address(-vmi_rom->virtual_top); - local_irq_save(flags); - activate_vmi(); - #ifdef CONFIG_X86_IO_APIC /* This is virtual hardware; timer routing is wired correctly */ no_timer_check = 1; #endif +} + +void vmi_activate(void) +{ + unsigned long flags; + + if (!vmi_rom) + return; + + local_irq_save(flags); + activate_vmi(); local_irq_restore(flags & X86_EFLAGS_IF); } diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 254ee07f863..c4c1f9e0940 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) /* Upper bound is clockevent's use of ulong for cycle deltas. */ evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); evt->min_delta_ns = clockevent_delta2ns(1, evt); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", evt->name, evt->mult, evt->shift); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc..82c67559dde 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -44,6 +44,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405..1a614c0e6be 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -35,6 +35,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) _etext = .; /* End of text section */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86..44153afc906 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,9 @@ * want per guest time just set the kernel.vsyscall64 sysctl to 0. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> @@ -128,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) gettimeofday(tv,NULL); return; } + + /* + * Surround the RDTSC by barriers, to make sure it's not + * speculated to outside the seqlock critical section and + * does not cause time warps: + */ + rdtsc_barrier(); now = vread(); + rdtsc_barrier(); + base = __vsyscall_gtod_data.clock.cycle_last; mask = __vsyscall_gtod_data.clock.mask; mult = __vsyscall_gtod_data.clock.mult; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 15c3e699918..2b54fe002e9 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf) * Restore the extended state if present. Otherwise, restore the FP/SSE * state. */ -int restore_user_xstate(void __user *buf) +static int restore_user_xstate(void __user *buf) { struct _fpx_sw_bytes fx_sw_user; u64 mask; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c02343594b4..d3ec292f00f 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif -ifeq ($(CONFIG_DMAR),y) -common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +ifeq ($(CONFIG_IOMMU_API),y) +common-objs += $(addprefix ../../../virt/kvm/, iommu.o) endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 59ebd37ad79..e665d1c623c 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm) static void __inject_pit_timer_intr(struct kvm *kvm) { + struct kvm_vcpu *vcpu; + int i; + mutex_lock(&kvm->lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); mutex_unlock(&kvm->lock); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + kvm_apic_nmi_wd_deliver(vcpu); + } } void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 17e41e165f1..179dcb0103f 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,10 +26,40 @@ * Port from Qemu. */ #include <linux/mm.h> +#include <linux/bitops.h> #include "irq.h" #include <linux/kvm_host.h> +static void pic_lock(struct kvm_pic *s) +{ + spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) +{ + struct kvm *kvm = s->kvm; + unsigned acks = s->pending_acks; + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu; + + s->pending_acks = 0; + s->wakeup_needed = false; + + spin_unlock(&s->lock); + + while (acks) { + kvm_notify_acked_irq(kvm, __ffs(acks)); + acks &= acks - 1; + } + + if (wakeup) { + vcpu = s->kvm->vcpus[0]; + if (vcpu) + kvm_vcpu_kick(vcpu); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { + pic_lock(s); pic_update_irq(s); + pic_unlock(s); } void kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } + pic_unlock(s); } /* @@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + pic_unlock(s); kvm_notify_acked_irq(kvm, irq); return intno; @@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, irqbase; + int irq, irqbase, n; struct kvm *kvm = s->pics_state->irq_request_opaque; struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; @@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s) for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (s->irr & (1 << irq) || s->isr & (1 << irq)) - kvm_notify_acked_irq(kvm, irq+irqbase); + if (s->irr & (1 << irq) || s->isr & (1 << irq)) { + n = irq + irqbase; + s->pics_state->pending_acks |= 1 << n; + } } s->last_irr = 0; s->irr = 0; @@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return; } + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } + pic_unlock(s); } static void picdev_read(struct kvm_io_device *this, @@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return; } + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; + pic_unlock(s); } /* @@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } @@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; + spin_lock_init(&s->lock); + s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; s->irq_request = pic_irq_request; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f17c8f5bbf3..2bf32a03cee 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -25,6 +25,7 @@ #include <linux/mm_types.h> #include <linux/hrtimer.h> #include <linux/kvm_host.h> +#include <linux/spinlock.h> #include "iodev.h" #include "ioapic.h" @@ -59,6 +60,10 @@ struct kvm_kpic_state { }; struct kvm_pic { + spinlock_t lock; + bool wakeup_needed; + unsigned pending_acks; + struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ irq_request_func *irq_request; void *irq_request_opaque; @@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 65ef0fc2c03..8e5ee99551f 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -7,7 +7,7 @@ #include <linux/kvm_host.h> #include <asm/msr.h> -#include "svm.h" +#include <asm/svm.h> static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0fc3cab4894..afac68c0815 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic) return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; } +static inline int apic_lvt_nmi_mode(u32 lvt_val) +{ + return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_NMI: kvm_inject_nmi(vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: @@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } break; + case APIC_DM_EXTINT: + /* + * Should only be called by kvm_apic_local_deliver() with LVT0, + * before NMI watchdog was enabled. Already handled by + * kvm_apic_accept_pic_intr(). + */ + break; + default: printk(KERN_ERR "TODO: unsupported delivery mode %x\n", delivery_mode); @@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->timer.period))); } +static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) +{ + int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + + if (apic_lvt_nmi_mode(lvt0_val)) { + if (!nmi_wd_enabled) { + apic_debug("Receive NMI setting on APIC_LVT0 " + "for cpu %d\n", apic->vcpu->vcpu_id); + apic->vcpu->kvm->arch.vapics_in_nmi_mode++; + } + } else if (nmi_wd_enabled) + apic->vcpu->kvm->arch.vapics_in_nmi_mode--; +} + static void apic_mmio_write(struct kvm_io_device *this, gpa_t address, int len, const void *data) { @@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; + case APIC_LVT0: + apic_manage_nmi_watchdog(apic, val); case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: - case APIC_LVT0: case APIC_LVT1: case APIC_LVTERR: /* TODO: Check vector */ @@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -static int __inject_apic_timer_irq(struct kvm_lapic *apic) +static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +{ + u32 reg = apic_get_reg(apic, lvt_type); + int vector, mode, trig_mode; + + if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + vector = reg & APIC_VECTOR_MASK; + mode = reg & APIC_MODE_MASK; + trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; + return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + } + return 0; +} + +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) { - int vector; + struct kvm_lapic *apic = vcpu->arch.apic; - vector = apic_lvt_vector(apic, APIC_LVTT); - return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); + if (apic) + kvm_apic_local_deliver(apic, APIC_LVT0); } static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) @@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && apic_lvt_enabled(apic, APIC_LVTT) && - atomic_read(&apic->timer.pending) > 0) { - if (__inject_apic_timer_irq(apic)) + if (apic && atomic_read(&apic->timer.pending) > 0) { + if (kvm_apic_local_deliver(apic, APIC_LVTT)) atomic_dec(&apic->timer.pending); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f1983d9477c..83f11c7474a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -17,7 +17,6 @@ * */ -#include "vmx.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -33,6 +32,7 @@ #include <asm/page.h> #include <asm/cmpxchg.h> #include <asm/io.h> +#include <asm/vmx.h> /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mt_mask; void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count += 1; } @@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count -= 1; WARN_ON(*write_count < 0); } static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + struct kvm_memory_slot *slot; int *largepage_idx; + gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot); return *largepage_idx; @@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } -static void rmap_write_protect(struct kvm *kvm, u64 gfn) +static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; @@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) - kvm_flush_remote_tlbs(kvm); + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&sp->oos_link); ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; + sp->global = 1; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return sp; @@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte) struct kvm_mmu_page *sp = page_header(__pa(spte)); index = spte - sp->spt; - __set_bit(index, sp->unsync_child_bitmap); - sp->unsync_children = 1; + if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) + sp->unsync_children++; + WARN_ON(!sp->unsync_children); } static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) @@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - sp->unsync_children = 1; kvm_mmu_update_parents_unsync(sp); return 1; } @@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +#define KVM_PAGE_ARRAY_NR 16 + +struct kvm_mmu_pages { + struct mmu_page_and_offset { + struct kvm_mmu_page *sp; + unsigned int idx; + } page[KVM_PAGE_ARRAY_NR]; + unsigned int nr; +}; + #define for_each_unsync_children(bitmap, idx) \ for (idx = find_first_bit(bitmap, 512); \ idx < 512; \ idx = find_next_bit(bitmap, 512, idx+1)) -static int mmu_unsync_walk(struct kvm_mmu_page *sp, - struct kvm_unsync_walk *walker) +int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) { - int i, ret; + int i; - if (!sp->unsync_children) - return 0; + if (sp->unsync) + for (i=0; i < pvec->nr; i++) + if (pvec->page[i].sp == sp) + return 0; + + pvec->page[pvec->nr].sp = sp; + pvec->page[pvec->nr].idx = idx; + pvec->nr++; + return (pvec->nr == KVM_PAGE_ARRAY_NR); +} + +static int __mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent)) { + if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { struct kvm_mmu_page *child; child = page_header(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { - ret = mmu_unsync_walk(child, walker); - if (ret) + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + __clear_bit(i, sp->unsync_child_bitmap); + else if (ret > 0) + nr_unsync_leaf += ret; + else return ret; - __clear_bit(i, sp->unsync_child_bitmap); } if (child->unsync) { - ret = walker->entry(child, walker); - __clear_bit(i, sp->unsync_child_bitmap); - if (ret) - return ret; + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; } } } @@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) sp->unsync_children = 0; - return 0; + return nr_unsync_leaf; +} + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + if (!sp->unsync_children) + return 0; + + mmu_pages_add(pvec, sp, 0); + return __mmu_unsync_walk(sp, pvec); } static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) @@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + list_del(&sp->oos_link); + --kvm->stat.mmu_unsync_global; +} + static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); sp->unsync = 0; + if (sp->global) + kvm_unlink_unsync_global(kvm, sp); --kvm->stat.mmu_unsync; } @@ -1037,41 +1092,101 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 1; } - rmap_write_protect(vcpu->kvm, sp->gfn); + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } kvm_mmu_flush_tlb(vcpu); - kvm_unlink_unsync_page(vcpu->kvm, sp); return 0; } -struct sync_walker { - struct kvm_vcpu *vcpu; - struct kvm_unsync_walk walker; +struct mmu_page_path { + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; + unsigned int idx[PT64_ROOT_LEVEL-1]; }; -static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +#define for_each_sp(pvec, sp, parents, i) \ + for (i = mmu_pages_next(&pvec, &parents, -1), \ + sp = pvec.page[i].sp; \ + i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i = mmu_pages_next(&pvec, &parents, i)) + +int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, + int i) { - struct sync_walker *sync_walk = container_of(walk, struct sync_walker, - walker); - struct kvm_vcpu *vcpu = sync_walk->vcpu; + int n; + + for (n = i+1; n < pvec->nr; n++) { + struct kvm_mmu_page *sp = pvec->page[n].sp; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + parents->idx[0] = pvec->page[n].idx; + return n; + } - kvm_sync_page(vcpu, sp); - return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); + parents->parent[sp->role.level-2] = sp; + parents->idx[sp->role.level-1] = pvec->page[n].idx; + } + + return n; } -static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +void mmu_pages_clear_parents(struct mmu_page_path *parents) { - struct sync_walker walker = { - .walker = { .entry = mmu_sync_fn, }, - .vcpu = vcpu, - }; + struct kvm_mmu_page *sp; + unsigned int level = 0; + + do { + unsigned int idx = parents->idx[level]; + + sp = parents->parent[level]; + if (!sp) + return; + + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); + level++; + } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); +} + +static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, + struct mmu_page_path *parents, + struct kvm_mmu_pages *pvec) +{ + parents->parent[parent->role.level-1] = NULL; + pvec->nr = 0; +} - while (mmu_unsync_walk(sp, &walker.walker)) +static void mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent) +{ + int i; + struct kvm_mmu_page *sp; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + int protected = 0; + + for_each_sp(pages, sp, parents, i) + protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + + if (protected) + kvm_flush_remote_tlbs(vcpu->kvm); + + for_each_sp(pages, sp, parents, i) { + kvm_sync_page(vcpu, sp); + mmu_pages_clear_parents(&parents); + } cond_resched_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_pages_init(parent, &parents, &pages); + } } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->role = role; hlist_add_head(&sp->hash_link, bucket); if (!metaphysical) { - rmap_write_protect(vcpu->kvm, gfn); + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) @@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker, if (level == PT32E_ROOT_LEVEL) { shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; shadow_addr &= PT64_BASE_ADDR_MASK; + if (!shadow_addr) + return 1; --level; } @@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } -struct zap_walker { - struct kvm_unsync_walk walker; - struct kvm *kvm; - int zapped; -}; - -static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +static int mmu_zap_unsync_children(struct kvm *kvm, + struct kvm_mmu_page *parent) { - struct zap_walker *zap_walk = container_of(walk, struct zap_walker, - walker); - kvm_mmu_zap_page(zap_walk->kvm, sp); - zap_walk->zapped = 1; - return 0; -} + int i, zapped = 0; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; -static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct zap_walker walker = { - .walker = { .entry = mmu_zap_fn, }, - .kvm = kvm, - .zapped = 0, - }; - - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - mmu_unsync_walk(sp, &walker.walker); - return walker.zapped; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + struct kvm_mmu_page *sp; + + for_each_sp(pages, sp, parents, i) { + kvm_mmu_zap_page(kvm, sp); + mmu_pages_clear_parents(&parents); + } + zapped += pages.nr; + kvm_mmu_pages_init(parent, &parents, &pages); + } + + return zapped; } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); struct kvm_mmu_page *sp = page_header(__pa(pte)); - __set_bit(slot, &sp->slot_bitmap); + __set_bit(slot, sp->slot_bitmap); } static void mmu_convert_notrap(struct kvm_mmu_page *sp) @@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + int num_var_ranges = KVM_NR_VAR_MTRR; + + if (!mtrr_state->enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. Just return the type as per start */ + if (mtrr_state->have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state->enabled & 2)) + return mtrr_state->def_type; + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} + static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { unsigned index; @@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } - kvm_mmu_mark_parents_unsync(vcpu, sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; + + if (sp->global) { + list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); + ++vcpu->kvm->stat.mmu_unsync_global; + } else + kvm_mmu_mark_parents_unsync(vcpu, sp); + mmu_convert_notrap(sp); return 0; } @@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - gfn_t gfn, pfn_t pfn, bool speculative, + int global, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; + u64 mt_mask = shadow_mt_mask; + struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); + + if (!(vcpu->arch.cr4 & X86_CR4_PGE)) + global = 0; + if (!global && sp->global) { + sp->global = 0; + if (sp->unsync) { + kvm_unlink_unsync_global(vcpu->kvm, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } + } + /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; + if (mt_mask) { + mt_mask = get_memory_type(vcpu, gfn) << + kvm_x86_ops->get_mt_mask_shift(); + spte |= mt_mask; + } spte |= (u64)pfn << PAGE_SHIFT; @@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= PT_WRITABLE_MASK; + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writeble_pte(*shadow_pte)) + goto set_pte; + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); @@ -1495,8 +1746,8 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) + int *ptwrite, int largepage, int global, + gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); @@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + dirty, largepage, global, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 0, walk->write, 1, &walk->pt_write, - walk->largepage, gfn, walk->pfn, false); + walk->largepage, 0, gfn, walk->pfn, false); ++vcpu->stat.pf_fixed; return 1; } @@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) } } +static void mmu_sync_global(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *sp, *n; + + list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) + kvm_sync_page(vcpu, sp); +} + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->kvm->mmu_lock); @@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_global(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; + if (guest_initiated) { + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; @@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - spin_lock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.invlpg(vcpu, gva); - spin_unlock(&vcpu->kvm->mmu_lock); kvm_mmu_flush_tlb(vcpu); ++vcpu->stat.invlpg; } @@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) int i; u64 *pt; - if (!test_bit(slot, &sp->slot_bitmap)) + if (!test_bit(slot, sp->slot_bitmap)) continue; pt = sp->spt; @@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->role.metaphysical) continue; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); gfn = unalias_gfn(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; if (*rmapp) printk(KERN_ERR "%s: (%s) shadow page has writable" diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 613ec9aa674..9fd78b6e17a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -82,6 +82,7 @@ struct shadow_walker { int *ptwrite; pfn_t pfn; u64 *sptep; + gpa_t pte_gpa; }; static gfn_t gpte_to_gfn(pt_element_t gpte) @@ -222,7 +223,7 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); + kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); walker->ptes[walker->level - 1] = pte; } @@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), + gpte & PT_DIRTY_MASK, NULL, largepage, + gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), pfn, true); } @@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, sw->user_fault, sw->write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, - false); + sw->ptwrite, sw->largepage, + gw->ptes[gw->level-1] & PT_GLOBAL_MASK, + gw->gfn, sw->pfn, false); sw->sptep = sptep; return 1; } @@ -331,6 +334,7 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); if (r || curr_pte != gw->ptes[level - 2]) { + kvm_mmu_put_page(shadow_page, sptep); kvm_release_pfn_clean(sw->pfn); sw->sptep = NULL; return 1; @@ -465,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, struct kvm_vcpu *vcpu, u64 addr, u64 *sptep, int level) { + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); - if (level == PT_PAGE_TABLE_LEVEL) { - if (is_shadow_present_pte(*sptep)) + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + sw->pte_gpa = (sp->gfn << PAGE_SHIFT); + sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + } set_shadow_pte(sptep, shadow_trap_nonpresent_pte); return 1; } @@ -479,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { + pt_element_t gpte; struct shadow_walker walker = { .walker = { .entry = FNAME(shadow_invlpg_entry), }, + .pte_gpa = -1, }; + spin_lock(&vcpu->kvm->mmu_lock); walk_shadow(&walker.walker, vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + if (walker.pte_gpa == -1) + return; + if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, + sizeof(pt_element_t))) + return; + if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, + sizeof(pt_element_t), 0); + } } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -579,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gfn, + is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, spte_to_pfn(sp->spt[i]), true, false); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c4ce657d96..1452851ae25 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -28,6 +28,8 @@ #include <asm/desc.h> +#include <asm/virtext.h> + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static int has_svm(void) { - uint32_t eax, ebx, ecx, edx; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - printk(KERN_INFO "has_svm: not amd\n"); - return 0; - } + const char *msg; - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { - printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); + if (!cpu_has_svm(&msg)) { + printk(KERN_INFO "has_svn: %s\n", msg); return 0; } - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { - printk(KERN_DEBUG "has_svm: svm not available\n"); - return 0; - } return 1; } static void svm_hardware_disable(void *garbage) { - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + cpu_svm_disable(); } static void svm_hardware_enable(void *garbage) @@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ + if (seg == VCPU_SREG_CS) + var->g = s->limit > 0xfffff; + + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ + if (seg == VCPU_SREG_TR) + var->type |= 0x2; + var->unusable = !var->present; } @@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) rep = (io_info & SVM_IOIO_REP_MASK) != 0; down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; + skip_emulated_instruction(&svm->vcpu); return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); } @@ -1912,6 +1916,11 @@ static int get_npt_level(void) #endif } +static int svm_get_mt_mask_shift(void) +{ + return 0; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, + .get_mt_mask_shift = svm_get_mt_mask_shift, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d06b4dc0e2e..6259d746764 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -16,7 +16,6 @@ */ #include "irq.h" -#include "vmx.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -31,6 +30,8 @@ #include <asm/io.h> #include <asm/desc.h> +#include <asm/vmx.h> +#include <asm/virtext.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -90,6 +91,11 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -122,7 +128,7 @@ static struct vmcs_config { u32 vmentry_ctrl; } vmcs_config; -struct vmx_capability { +static struct vmx_capability { u32 ept; u32 vpid; } vmx_capability; @@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + /* Otherwise falls through to kvm_set_msr_common */ default: vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); @@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu) static __init int cpu_has_kvm_support(void) { - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ + return cpu_has_vmx(); } static __init int vmx_disabled_by_bios(void) @@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void) __vcpu_clear(vmx); } -static void hardware_disable(void *garbage) + +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() + * tricks. + */ +static void kvm_cpu_vmxoff(void) { - vmclear_local_vcpus(); asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); write_cr4(read_cr4() & ~X86_CR4_VMXE); } +static void hardware_disable(void *garbage) +{ + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = 0; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; - min = opt = 0; + min = 0; + opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) */ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { - u32 host_sysenter_cs; + u32 host_sysenter_cs, msr_low, msr_high; u32 junk; + u64 host_pat; unsigned long a; struct descriptor_table dt; int i; @@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_SYSENTER_EIP, a); vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; @@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.rmode.active = 0; + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; @@ -2335,6 +2374,29 @@ out: return ret; } +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + if (!cpu_has_virtual_nmis()) { + enable_irq_window(vcpu); + return; + } + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = NMI_VECTOR; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + NMI_VECTOR | INTR_TYPE_SOFT_INTR | + INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } +static void vmx_update_window_states(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + + vcpu->arch.nmi_window_open = + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + vcpu->arch.nmi_window_open = 0; + + vcpu->arch.interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS))); +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->arch.irq_summary); @@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) kvm_queue_interrupt(vcpu, irq); } - static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 cpu_based_vm_exec_control; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + vmx_update_window_states(vcpu); - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) - kvm_do_inject_irq(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (vcpu->arch.irq_summary + || kvm_run->request_interrupt_window) + enable_irq_window(vcpu); + return; + } - if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + if (vcpu->arch.interrupt_window_open) { + if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) + kvm_do_inject_irq(vcpu); - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + if (vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + } if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - /* - * Interrupts blocked. Wait for unblock. - */ - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - else - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + enable_irq_window(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; struct kvm_userspace_memory_region tss_mem = { - .slot = 8, + .slot = TSS_PRIVATE_MEMSLOT, .guest_phys_addr = addr, .memory_size = PAGE_SIZE * 3, .flags = 0, @@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { @@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; + skip_emulated_instruction(vcpu); return kvm_emulate_pio(vcpu, kvm_run, in, size, port); } @@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); KVMTRACE_0D(PEND_INTR, vcpu, handler); + ++vcpu->stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as @@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, if (kvm_run->request_interrupt_window && !vcpu->arch.irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++vcpu->stat.irq_window_exits; return 0; } return 1; @@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u16 tss_selector; int reason; @@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && + (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_NMI_INTR) { + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } tss_selector = exit_qualification; return kvm_task_switch(vcpu, tss_selector, reason); @@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - switch (err) { - case EMULATE_DONE: - break; - case EMULATE_DO_MMIO: - kvm_report_emulation_failure(vcpu, "mmio"); - /* TODO: Handle MMIO */ - return; - default: - kvm_report_emulation_failure(vcpu, "emulation failure"); - return; + if (err == EMULATE_DO_MMIO) + break; + + if (err != EMULATE_DONE) { + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; } if (signal_pending(current)) @@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); - /* Guest state should be valid now, no more emulation should be needed */ - vmx->emulation_required = 0; + /* Guest state should be valid now except if we need to + * emulate an MMIO */ + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; } /* @@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + /* If we need to emulate an MMIO from handle_invalid_guest_state + * we just return 0 */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return 0; + /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ if (vm_need_ept() && is_paging(vcpu)) { @@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info and " - "exit reason is 0x%x\n", __func__, exit_reason); + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + printk(KERN_WARNING "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vcpu->arch.interrupt_window_open) { + vmx->soft_vnmi_blocked = 0; + vcpu->arch.nmi_window_open = 1; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.nmi_window_open = 1; + } + } + if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); @@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); } -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - if (!cpu_has_virtual_nmis()) - return; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static int vmx_nmi_enabled(struct kvm_vcpu *vcpu) -{ - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - return !(guest_intr & (GUEST_INTR_STATE_NMI | - GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_STI)); -} - -static int vmx_irq_enabled(struct kvm_vcpu *vcpu) -{ - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_STI)) && - (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); -} - -static void enable_intr_window(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); -} - static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (unblock_nmi && vector != DF_VECTOR) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - } + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3147,24 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) { update_tpr_threshold(vcpu); - if (cpu_has_virtual_nmis()) { - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vmx_nmi_enabled(vcpu)) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_intr_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - enable_intr_window(vcpu); + vmx_update_window_states(vcpu); + + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); return; } } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); + return; + } if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vmx_irq_enabled(vcpu)) + if (vcpu->arch.interrupt_window_open) kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); else enable_irq_window(vcpu); @@ -3172,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) if (vcpu->arch.interrupt.pending) { vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); } } @@ -3211,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) + vmx->entry_time = ktime_get(); + /* Handle invalid guest state instead of entering VMX */ if (vmx->emulation_required && emulate_invalid_guest_state) { handle_invalid_guest_state(vcpu, kvm_run); @@ -3325,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0; + vmx_update_window_states(vcpu); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; @@ -3335,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (intr_info & INTR_INFO_VALID_MASK)) { KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); @@ -3453,6 +3571,11 @@ static int get_ept_level(void) return VMX_EPT_DEFAULT_GAW + 1; } +static int vmx_get_mt_mask_shift(void) +{ + return VMX_EPT_MT_EPTE_SHIFT; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3508,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, + .get_mt_mask_shift = vmx_get_mt_mask_shift, }; static int __init vmx_init(void) @@ -3564,10 +3688,10 @@ static int __init vmx_init(void) bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT | VMX_EPT_IGMT_BIT); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + VMX_EPT_EXECUTABLE_MASK, + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); kvm_enable_tdp(); } else kvm_disable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1f8ff2f1fa..cc17546a240 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -34,11 +34,13 @@ #include <linux/module.h> #include <linux/mman.h> #include <linux/highmem.h> +#include <linux/iommu.h> #include <linux/intel-iommu.h> #include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> +#include <asm/mtrr.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -86,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, + { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -93,6 +96,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, + { "nmi_injections", VCPU_STAT(nmi_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -101,6 +105,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -312,6 +317,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -355,6 +361,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -449,7 +456,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT }; static unsigned num_msrs_to_save; @@ -648,10 +655,38 @@ static bool msr_mtrr_valid(unsigned msr) static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - vcpu->arch.mtrr[msr - 0x200] = data; + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + + kvm_mmu_reset_context(vcpu); return 0; } @@ -747,10 +782,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - *pdata = vcpu->arch.mtrr[msr - 0x200]; + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + return 0; } @@ -903,7 +965,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQCHIP: case KVM_CAP_HLT: case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: case KVM_CAP_CLOCKSOURCE: @@ -929,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = !tdp_enabled; break; case KVM_CAP_IOMMU: - r = intel_iommu_found(); + r = iommu_found(); break; default: r = 0; @@ -1188,6 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int t, times = entry->eax & 0xff; entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times && *nent < maxnent; ++t) { do_cpuid_1_ent(&entry[t], function, 0); entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; @@ -1218,7 +1280,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff; + level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; do_cpuid_1_ent(&entry[i], function, i); @@ -1318,6 +1380,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_inject_nmi(vcpu); + vcpu_put(vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -1377,6 +1448,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = 0; break; } + case KVM_NMI: { + r = kvm_vcpu_ioctl_nmi(vcpu); + if (r) + goto out; + r = 0; + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -1968,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); return 1; } @@ -2404,8 +2482,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - kvm_x86_ops->skip_emulated_instruction(vcpu); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); @@ -2541,7 +2617,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); return 0; out: @@ -2729,7 +2805,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { + for (j = i + 1; ; j = (j + 1) % nent) { struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; @@ -2973,7 +3049,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -3275,9 +3351,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->padding = 0; } -static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, - u16 selector, - struct descriptor_table *dtable) +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) { if (selector & 1 << 2) { struct kvm_segment kvm_seg; @@ -3302,7 +3378,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); @@ -3321,7 +3397,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) return 1; @@ -3900,6 +3976,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) /* We do fxsave: this must be aligned. */ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); + vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); if (r == 0) @@ -3925,6 +4002,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = false; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -4012,6 +4092,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -4048,8 +4129,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_iommu_unmap_guest(kvm); kvm_free_all_assigned_devices(kvm); + kvm_iommu_unmap_guest(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -4127,7 +4208,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending; } static void vcpu_kick_intr(void *info) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ea051173b0d..d174db7a337 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -58,6 +58,7 @@ #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ #define SrcImm (5<<4) /* Immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcOne (7<<4) /* Implied '1' */ #define SrcMask (7<<4) /* Generic ModRM decode. */ #define ModRM (1<<7) @@ -70,17 +71,23 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Source 2 operand type */ +#define Src2None (0<<29) +#define Src2CL (1<<29) +#define Src2ImmByte (2<<29) +#define Src2One (3<<29) +#define Src2Mask (7<<29) enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; -static u16 opcode_table[256] = { +static u32 opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x08 - 0x0F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -195,7 +202,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; -static u16 twobyte_table[256] = { +static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, @@ -230,9 +237,14 @@ static u16 twobyte_table[256] = { /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, + ModRM, 0, /* 0xB0 - 0xB7 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, DstMem | SrcReg | ModRM | BitOp, @@ -253,7 +265,7 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static u16 group_table[] = { +static u32 group_table[] = { [Group1_80*8] = ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, @@ -297,9 +309,9 @@ static u16 group_table[] = { SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, }; -static u16 group2_table[] = { +static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, 0, + SrcNone | ModRM, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, }; @@ -359,49 +371,48 @@ static u16 group2_table[] = { "andl %"_msk",%"_LO32 _tmp"; " \ "orl %"_LO32 _tmp",%"_sav"; " +#ifdef CONFIG_X86_64 +#define ON64(x) x +#else +#define ON64(x) +#endif + +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _y ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + + /* Raw emulation: instruction has two explicit operands. */ #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + break; \ + case 4: \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + break; \ + case 8: \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + break; \ + } \ } while (0) #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ - unsigned long __tmp; \ + unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (__tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK)); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ @@ -425,71 +436,68 @@ static u16 group2_table[] = { __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ "w", "r", _LO32, "r", "", "r") -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"b %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"w %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"l %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ +/* Instruction has three operands and one operand is stored in ECX register */ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -/* Emulate an instruction with quadword operands (x86/64 only). */ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK)); \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"q %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ +#define __emulate_1op(_op, _dst, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op _suffix " %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "+m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ } while (0) -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ + case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ + case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + } \ + } while (0) /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ @@ -1041,6 +1049,33 @@ done_prefixes: c->src.bytes = 1; c->src.val = insn_fetch(s8, 1, c->eip); break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->regs[VCPU_REGS_RSP]); } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_std(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - &c->dst.val, c->dst.bytes, ctxt->vcpu); + rc = ops->read_emulated(register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + &c->src.val, c->src.bytes, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); + return rc; +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + c->src.bytes = c->dst.bytes; + rc = emulate_pop(ctxt, ops); + if (rc != 0) + return rc; + c->dst.val = c->src.val; return 0; } @@ -1415,24 +1463,15 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - -c->op_bytes); - c->dst.ptr = (void *) register_address( - c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); + emulate_push(ctxt); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - if ((rc = ops->read_std(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), c->dst.ptr, - c->op_bytes, ctxt->vcpu)) != 0) + c->src.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ops); + if (rc != 0) goto done; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - c->op_bytes); - c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.val = c->src.val; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -1591,7 +1630,9 @@ special_insn: emulate_push(ctxt); break; case 0x9d: /* popf */ + c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; @@ -1689,7 +1730,9 @@ special_insn: emulate_grp2(ctxt); break; case 0xc3: /* ret */ + c->dst.type = OP_REG; c->dst.ptr = &c->eip; + c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ mov: @@ -1778,7 +1821,7 @@ special_insn: c->eip = saved_eip; goto cannot_emulate; } - return 0; + break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; break; @@ -1999,12 +2042,20 @@ twobyte_insn: c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); break; + case 0xa4: /* shld imm8, r, r/m */ + case 0xa5: /* shld cl, r, r/m */ + emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + break; case 0xab: bts: /* bts */ /* only subword offset */ c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; + case 0xac: /* shrd imm8, r, r/m */ + case 0xad: /* shrd cl, r, r/m */ + emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + break; case 0xae: /* clflush */ break; case 0xb0 ... 0xb1: /* cmpxchg */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a5d8e1ace1c..a7ed208f81e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -590,7 +590,8 @@ static void __init lguest_init_IRQ(void) * a straightforward 1 to 1 mapping, so force that here. */ __get_cpu_var(vector_irq)[vector] = i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[vector]); + set_intr_gate(vector, + interrupt[vector-FIRST_EXTERNAL_VECTOR]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); @@ -737,7 +738,7 @@ static void lguest_time_init(void) /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ - lguest_clockevent.cpumask = cpumask_of_cpu(0); + lguest_clockevent.cpumask = cpumask_of(0); clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 5c7cef34c9e..10b9bd35a8f 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -30,21 +30,6 @@ ENTRY(lguest_entry) movl $lguest_data - __PAGE_OFFSET, %edx int $LGUEST_TRAP_ENTRY - /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl - * instruction uses %esi implicitly as the source for the copy we're - * about to do. */ - movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi - - /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. - * This means the first 128M of kernel memory will be mapped at - * PAGE_OFFSET where the kernel expects to run. This will get it far - * enough through boot to switch to its own pagetables. */ - movl $32, %ecx - movl %esi, %edi - addl $((__PAGE_OFFSET >> 22) * 4), %edi - rep - movsl - /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 37b9ae4d44c..df167f26562 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -133,29 +133,28 @@ void __init time_init_hook(void) **/ void mca_nmi_hook(void) { - /* If I recall correctly, there's a whole bunch of other things that + /* + * If I recall correctly, there's a whole bunch of other things that * we can do to check for NMI problems, but that's all I know about * at the moment. */ - - printk("NMI generated from unknown source!\n"); + pr_warning("NMI generated from unknown source!\n"); } #endif static __init int no_ipi_broadcast(char *str) { get_option(&str, &no_broadcast); - printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : - "IPI Broadcast"); + pr_info("Using %s mode\n", + no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); return 1; } - __setup("no_ipi_broadcast=", no_ipi_broadcast); static int __init print_ipi_mode(void) { - printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : - "Shortcut"); + pr_info("Using IPI %s mode\n", + no_broadcast ? "No-Shortcut" : "Shortcut"); return 0; } diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 3c3b471ea49..bc4c7840b2a 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -17,6 +17,7 @@ #include <asm/bigsmp/apic.h> #include <asm/bigsmp/ipi.h> #include <asm/mach-default/mach_mpparse.h> +#include <asm/mach-default/mach_wakecpu.h> static int dmi_bigsmp; /* can be set by dmi scanners */ @@ -41,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { - return cpumask_of_cpu(cpu); + cpus_clear(*retmask); + cpu_set(cpu, *retmask); } static int probe_bigsmp(void) diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 9e835a11a13..e63a4a76d8c 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -16,6 +16,7 @@ #include <asm/mach-default/mach_apic.h> #include <asm/mach-default/mach_ipi.h> #include <asm/mach-default/mach_mpparse.h> +#include <asm/mach-default/mach_wakecpu.h> /* should be called last. */ static int probe_default(void) diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 28459cab3dd..4ba5ccaa158 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -16,7 +16,19 @@ #include <asm/es7000/apic.h> #include <asm/es7000/ipi.h> #include <asm/es7000/mpparse.h> -#include <asm/es7000/wakecpu.h> +#include <asm/mach-default/mach_wakecpu.h> + +void __init es7000_update_genapic_to_cluster(void) +{ + genapic->target_cpus = target_cpus_cluster; + genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; + genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; + genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; + + genapic->init_apic_ldr = init_apic_ldr_cluster; + + genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster; +} static int probe_es7000(void) { @@ -75,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -85,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 71a309b122e..511d7941364 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 5a7e4619e1c..c346d9d0226 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -15,6 +15,7 @@ #include <asm/mpspec.h> #include <asm/apicdef.h> #include <asm/genapic.h> +#include <asm/setup.h> extern struct genapic apic_numaq; extern struct genapic apic_summit; @@ -57,6 +58,9 @@ static int __init parse_apic(char *arg) } } + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); + /* Parsed again by __setup for debug/verbose */ return 0; } @@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void) * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ - if (!cmdline_apic && genapic == &apic_default) + if (!cmdline_apic && genapic == &apic_default) { if (apic_bigsmp.probe()) { genapic = &apic_bigsmp; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Overriding APIC driver with %s\n", genapic->name); } + } #endif } @@ -94,6 +101,9 @@ void __init generic_apic_probe(void) /* Not visible without early console */ if (!apic_probe[i]) panic("Didn't find an APIC driver"); + + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); } printk(KERN_INFO "Using APIC driver %s\n", genapic->name); } @@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem, if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) { if (!cmdline_apic) { genapic = apic_probe[i]; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } @@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { if (!cmdline_apic) { genapic = apic_probe[i]; + if (x86_quirks->update_genapic) + x86_quirks->update_genapic(); printk(KERN_INFO "Switched to APIC driver `%s'.\n", genapic->name); } diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6272b5e69da..2821ffc188b 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -16,6 +16,7 @@ #include <asm/summit/apic.h> #include <asm/summit/ipi.h> #include <asm/summit/mpparse.h> +#include <asm/mach-default/mach_wakecpu.h> static int probe_summit(void) { @@ -23,7 +24,7 @@ static int probe_summit(void) return 0; } -static cpumask_t vector_allocation_domain(int cpu) +static void vector_allocation_domain(int cpu, cpumask_t *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -33,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; - return domain; + *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; } struct genapic apic_summit = APIC_INIT("summit", probe_summit); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 52145007bd7..9840b7ec749 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1; /* Used for the invalidate map that's also checked in the spinlock */ static volatile unsigned long smp_invalidate_needed; -/* Bitmask of currently online CPUs - used by setup.c for - /proc/cpuinfo, visible externally but still physical */ -cpumask_t cpu_online_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_online_map); - /* Bitmask of CPUs present in the system - exported by i386_syms.c, used * by scheduler but indexed physically */ cpumask_t phys_cpu_present_map = CPU_MASK_NONE; @@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* This is for the new dynamic CPU boot code */ cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; -cpumask_t cpu_possible_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_possible_map); /* The per processor IRQ masks (these are usually kept in sync) */ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; @@ -364,9 +357,8 @@ void __init find_smp_config(void) printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); /* initialize the CPU structures (moved from smp_boot_cpus) */ - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) cpu_irq_affinity[i] = ~0; - } cpu_online_map = cpumask_of_cpu(boot_cpu_id); /* The boot CPU must be extended */ @@ -679,7 +671,7 @@ void __init smp_boot_cpus(void) /* loop over all the extended VIC CPUs and boot them. The * Quad CPUs must be bootstrapped by their extended VIC cpu */ - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) continue; do_boot_cpu(i); @@ -1234,7 +1226,7 @@ int setup_profiling_timer(unsigned int multiplier) * new values until the next timer interrupt in which they do process * accounting. */ - for (i = 0; i < NR_CPUS; ++i) + for (i = 0; i < nr_cpu_ids; ++i) per_cpu(prof_multiplier, i) = multiplier; return 0; @@ -1264,7 +1256,7 @@ void __init voyager_smp_intr_init(void) int i; /* initialize the per cpu irq mask to all disabled */ - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < nr_cpu_ids; i++) vic_irq_mask[i] = 0xFFFF; VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fea4565ff57..d8cc96a2738 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-y := pf_in.o mmio-mod.o +mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa_$(BITS).o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 31e8730fa24..57ec8c86a87 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -53,7 +53,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE_HOOKS +#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; @@ -393,7 +393,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, if (pte && pte_present(*pte) && !pte_exec(*pte)) printk(KERN_CRIT "kernel tried to execute " "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); + "(uid: %d)\n", current_uid()); } #endif @@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, unsigned long error_code) { unsigned long flags = oops_begin(); + int sig = SIGKILL; struct task_struct *tsk; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", @@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; if (__die("Bad pagetable", regs, error_code)) - regs = NULL; - oops_end(flags, regs, SIGKILL); + sig = 0; + oops_end(flags, regs, sig); } #endif @@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) int fault; #ifdef CONFIG_X86_64 unsigned long flags; + int sig; #endif tsk = current; @@ -849,11 +851,12 @@ no_context: bust_spinlocks(0); do_exit(SIGKILL); #else + sig = SIGKILL; if (__die("Oops", regs, error_code)) - regs = NULL; + sig = 0; /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags, regs, SIGKILL); + oops_end(flags, regs, sig); #endif /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index c483f424207..f99a6c6c432 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/pci.h> #include <linux/pfn.h> #include <linux/poison.h> #include <linux/bootmem.h> @@ -67,7 +68,7 @@ static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; -static __init void *alloc_low_page(unsigned long *phys) +static __init void *alloc_low_page(void) { unsigned long pfn = table_end++; void *adr; @@ -77,7 +78,6 @@ static __init void *alloc_low_page(unsigned long *phys) adr = __va(pfn * PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *phys = pfn * PAGE_SIZE; return adr; } @@ -92,16 +92,17 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_t *pmd_table; #ifdef CONFIG_X86_PAE - unsigned long phys; if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_init_bootmem) pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); else - pmd_table = (pmd_t *)alloc_low_page(&phys); + pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); + + return pmd_table; } #endif pud = pud_offset(pgd, 0); @@ -126,10 +127,8 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!page_table) page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); - } else { - unsigned long phys; - page_table = (pte_t *)alloc_low_page(&phys); - } + } else + page_table = (pte_t *)alloc_low_page(); paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -436,8 +435,12 @@ static void __init set_highmem_pages_init(void) #endif /* !CONFIG_NUMA */ #else -# define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init() do { } while (0) +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +static inline void set_highmem_pages_init(void) +{ +} #endif /* CONFIG_HIGHMEM */ void __init native_pagetable_setup_start(pgd_t *base) @@ -969,7 +972,7 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; - start_periodic_check_for_corruption(); + pci_iommu_alloc(); #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); @@ -1040,11 +1043,25 @@ void __init mem_init(void) (unsigned long)&_text, (unsigned long)&_etext, ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + /* + * Check boundaries twice: Some fundamental inconsistencies can + * be detected at build time already. + */ +#define __FIXADDR_TOP (-PAGE_SIZE) +#ifdef CONFIG_HIGHMEM + BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE); +#endif +#define high_memory (-128UL << 20) + BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END); +#undef high_memory +#undef __FIXADDR_TOP + #ifdef CONFIG_HIGHMEM BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON(VMALLOC_START >= VMALLOC_END); BUG_ON((unsigned long)high_memory > VMALLOC_START); if (boot_cpu_data.wp_works_ok < 0) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9db01db6e3c..9f7a0d24d42 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -902,8 +902,6 @@ void __init mem_init(void) long codesize, reservedpages, datasize, initsize; unsigned long absent_pages; - start_periodic_check_for_corruption(); - pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4c4307ff3e..bd85d42819e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -223,7 +223,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ON(iomem_map_sanity_check(phys_addr, size)); + WARN_ONCE(iomem_map_sanity_check(phys_addr, size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); /* * Don't allow anybody to remap normal RAM that we're using.. diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index cebcbf152d4..71a14f89f89 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -278,7 +278,7 @@ void __init numa_init_array(void) int rr, i; rr = first_node(node_online_map); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); @@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) memnodemap[0] = 0; node_set_online(0); node_set(0, node_possible_map); - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < nr_cpu_ids; i++) numa_set_node(i, 0); e820_register_active_regions(0, start_pfn, last_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index eb1bf000d12..85cbd3cd372 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -596,6 +596,242 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } +/* + * Internal interface to reserve a range of physical memory with prot. + * Reserved non RAM regions only and after successful reserve_memtype, + * this func also keeps identity mapping (if any) in sync with this new prot. + */ +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot) +{ + int is_ram = 0; + int id_sz, ret; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + is_ram = pagerange_is_ram(paddr, paddr + size); + + if (is_ram != 0) { + /* + * For mapping RAM pages, drivers need to call + * set_memory_[uc|wc|wb] directly, for reserve and free, before + * setting up the PTE. + */ + WARN_ON_ONCE(1); + return 0; + } + + ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); + if (ret) + return ret; + + if (flags != want_flags) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR + "%s:%d reserve_pfn_range ioremap_change_attr failed %s " + "for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size)); + return -EINVAL; + } + return 0; +} + +/* + * Internal interface to free a range of physical memory. + * Frees non RAM regions only. + */ +static void free_pfn_range(u64 paddr, unsigned long size) +{ + int is_ram; + + is_ram = pagerange_is_ram(paddr, paddr + size); + if (is_ram == 0) + free_memtype(paddr, paddr + size); +} + +/* + * track_pfn_vma_copy is called when vma that is covering the pfnmap gets + * copied through copy_page_range(). + * + * If the vma has a linear pfn mapping for the entire range, we get the prot + * from pte and reserve the entire vma range with single reserve_pfn_range call. + * Otherwise, we reserve the entire vma range, my ging through the PTEs page + * by page to get physical address and protection. + */ +int track_pfn_vma_copy(struct vm_area_struct *vma) +{ + int retval = 0; + unsigned long i, j; + resource_size_t paddr; + unsigned long prot; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* + * reserve the whole chunk covered by vma. We need the + * starting address and protection from pte. + */ + if (follow_phys(vma, vma_start, 0, &prot, &paddr)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + return reserve_pfn_range(paddr, vma_size, __pgprot(prot)); + } + + /* reserve entire vma page by page, using pfn and prot from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) + continue; + + retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot)); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + if (follow_phys(vma, vma_start + j, 0, &prot, &paddr)) + continue; + + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * track_pfn_vma_new is called when a _new_ pfn mapping is being established + * for physical range indicated by pfn and size. + * + * prot is passed in as a parameter for the new mapping. If the vma has a + * linear pfn mapping for the entire range reserve the entire vma range with + * single reserve_pfn_range call. + * Otherwise, we look t the pfn and size and reserve only the specified range + * page by page. + * + * Note that this function can be called with caller trying to map only a + * subrange/page inside the vma. + */ +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, + unsigned long pfn, unsigned long size) +{ + int retval = 0; + unsigned long i, j; + resource_size_t base_paddr; + resource_size_t paddr; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return 0; + + if (is_linear_pfn_mapping(vma)) { + /* reserve the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + return reserve_pfn_range(paddr, vma_size, prot); + } + + /* reserve page by page using pfn and size */ + base_paddr = (resource_size_t)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = base_paddr + i; + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + if (retval) + goto cleanup_ret; + } + return 0; + +cleanup_ret: + /* Reserve error: Cleanup partial reservation and return error */ + for (j = 0; j < i; j += PAGE_SIZE) { + paddr = base_paddr + j; + free_pfn_range(paddr, PAGE_SIZE); + } + + return retval; +} + +/* + * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack can be called for a specific region indicated by pfn and size or + * can be for the entire vma (in which case size can be zero). + */ +void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, + unsigned long size) +{ + unsigned long i; + resource_size_t paddr; + unsigned long prot; + unsigned long vma_start = vma->vm_start; + unsigned long vma_end = vma->vm_end; + unsigned long vma_size = vma_end - vma_start; + + if (!pat_enabled) + return; + + if (is_linear_pfn_mapping(vma)) { + /* free the whole chunk starting from vm_pgoff */ + paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; + free_pfn_range(paddr, vma_size); + return; + } + + if (size != 0 && size != vma_size) { + /* free page by page, using pfn and size */ + paddr = (resource_size_t)pfn << PAGE_SHIFT; + for (i = 0; i < size; i += PAGE_SIZE) { + paddr = paddr + i; + free_pfn_range(paddr, PAGE_SIZE); + } + } else { + /* free entire vma, page by page, using the pfn from pte */ + for (i = 0; i < vma_size; i += PAGE_SIZE) { + if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) + continue; + + free_pfn_range(paddr, PAGE_SIZE); + } + } +} + +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + if (pat_enabled) + return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC); + else + return pgprot_noncached(prot); +} + #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 51c0a2fc14f..09737c8af07 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (!node_online(i)) setup_node_bootmem(i, nodes[i].start, nodes[i].end); - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { int node = early_cpu_to_node(i); if (node == NUMA_NO_NODE) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 022cd41ea9b..202864ad49a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -401,14 +401,13 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/pii"; break; case 6 ... 8: + case 10 ... 11: *cpu_type = "i386/piii"; break; case 9: + case 13: *cpu_type = "i386/p6_mobile"; break; - case 10 ... 13: - *cpu_type = "i386/p6"; - break; case 14: *cpu_type = "i386/core"; break; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 509513760a6..98658f25f54 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -65,11 +65,13 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_FETCH_BEGIN 3 #define IBS_OP_BEGIN 4 -/* The function interface needs to be fixed, something like add - data. Should then be added to linux/oprofile.h. */ +/* + * The function interface needs to be fixed, something like add + * data. Should then be added to linux/oprofile.h. + */ extern void -oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int *const ibs_sample, int ibs_code); +oprofile_add_ibs_sample(struct pt_regs * const regs, + unsigned int * const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ @@ -104,11 +106,6 @@ struct ibs_op_sample { unsigned int ibs_dc_phys_high; }; -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ -*/ -static void clear_ibs_nmi(void); - static int ibs_allowed; /* AMD Family10h and later */ struct op_ibs_config { @@ -223,7 +220,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_fetch, IBS_FETCH_BEGIN); - /*reenable the IRQ */ + /* reenable the IRQ */ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); high &= ~IBS_FETCH_HIGH_VALID_BIT; high |= IBS_FETCH_HIGH_ENABLE; @@ -331,8 +328,10 @@ static void op_amd_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ + /* + * Subtle: stop on all counters to avoid race with setting our + * pm callback + */ for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; @@ -343,13 +342,15 @@ static void op_amd_stop(struct op_msrs const * const msrs) #ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { - low = 0; /* clear max count and enable */ + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = 0; /* clear max count and enable */ + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } @@ -370,18 +371,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifndef CONFIG_OPROFILE_IBS - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#else +#ifdef CONFIG_OPROFILE_IBS static u8 ibs_eilvt_off; @@ -395,7 +385,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg) setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } -static int pfm_amd64_setup_eilvt(void) +static int init_ibs_nmi(void) { #define IBSCTL_LVTOFFSETVAL (1 << 8) #define IBSCTL 0x1cc @@ -443,18 +433,22 @@ static int pfm_amd64_setup_eilvt(void) return 0; } -/* - * initialize the APIC for the IBS interrupts - * if available (AMD Family10h rev B0 and later) - */ -static void setup_ibs(void) +/* uninitialize the APIC for the IBS interrupts if needed */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +/* initialize the APIC for the IBS interrupts if available */ +static void ibs_init(void) { ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); if (!ibs_allowed) return; - if (pfm_amd64_setup_eilvt()) { + if (init_ibs_nmi()) { ibs_allowed = 0; return; } @@ -462,14 +456,12 @@ static void setup_ibs(void) printk(KERN_INFO "oprofile: AMD IBS detected\n"); } - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h - * rev B0 and later */ -static void clear_ibs_nmi(void) +static void ibs_exit(void) { - if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); + if (!ibs_allowed) + return; + + clear_ibs_nmi(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -519,7 +511,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) static int op_amd_init(struct oprofile_operations *ops) { - setup_ibs(); + ibs_init(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; return 0; @@ -527,10 +519,21 @@ static int op_amd_init(struct oprofile_operations *ops) static void op_amd_exit(void) { - clear_ibs_nmi(); + ibs_exit(); } -#endif +#else + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 716d26f0e5d..e9f80c744cf 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -156,6 +156,8 @@ static void ppro_start(struct op_msrs const * const msrs) unsigned int low, high; int i; + if (!reset_value) + return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { CTRL_READ(low, high, msrs, i); @@ -171,6 +173,8 @@ static void ppro_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; + if (!reset_value) + return; for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1d88d2b3977..9e5752fe4d1 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,7 +4,7 @@ #include <linux/irq.h> #include <linux/dmi.h> #include <asm/numa.h> -#include "pci.h" +#include <asm/pci_x86.h> struct pci_root_info { char *name; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 22e057665e5..9bb09823b36 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -2,7 +2,7 @@ #include <linux/pci.h> #include <linux/topology.h> #include <linux/cpu.h> -#include "pci.h" +#include <asm/pci_x86.h> #ifdef CONFIG_X86_64 #include <asm/pci-direct.h> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index b67732bbb85..62ddb73e09e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -14,8 +14,7 @@ #include <asm/segment.h> #include <asm/io.h> #include <asm/smp.h> - -#include "pci.h" +#include <asm/pci_x86.h> unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; @@ -23,6 +22,12 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; +int noioapicquirk; +#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS +int noioapicreroute = 0; +#else +int noioapicreroute = 1; +#endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; @@ -519,6 +524,17 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "skip_isa_align")) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; return NULL; + } else if (!strcmp(str, "noioapicquirk")) { + noioapicquirk = 1; + return NULL; + } else if (!strcmp(str, "ioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 0; + return NULL; + } else if (!strcmp(str, "noioapicreroute")) { + if (noioapicreroute != -1) + noioapicreroute = 1; + return NULL; } return str; } diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 9915293500f..bd13c3e4c6d 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -5,7 +5,7 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/dmi.h> -#include "pci.h" +#include <asm/pci_x86.h> /* * Functions for accessing PCI base (first 256 bytes) and extended @@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, #undef PCI_CONF2_ADDRESS -static struct pci_raw_ops pci_direct_conf2 = { +struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; @@ -289,6 +289,7 @@ int __init pci_direct_probe(void) if (pci_check_type1()) { raw_pci_ops = &pci_direct_conf1; + port_cf9_safe = true; return 1; } release_resource(region); @@ -305,6 +306,7 @@ int __init pci_direct_probe(void) if (pci_check_type2()) { raw_pci_ops = &pci_direct_conf2; + port_cf9_safe = true; return 2; } diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 86631ccbc25..f6adf2c6d75 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -2,7 +2,7 @@ #include <linux/pci.h> #include <asm/pci-direct.h> #include <asm/io.h> -#include "pci.h" +#include <asm/pci_x86.h> /* Direct PCI access. This is used for PCI accesses in early boot before the PCI subsystem works. */ diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 2051dc96b8e..7d388d5cf54 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -6,8 +6,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/init.h> -#include "pci.h" - +#include <asm/pci_x86.h> static void __devinit pci_fixup_i450nx(struct pci_dev *d) { diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 844df0cbbd3..e51bf2cda4b 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -34,8 +34,8 @@ #include <asm/pat.h> #include <asm/e820.h> +#include <asm/pci_x86.h> -#include "pci.h" static int skip_isa_ioresource_align(struct pci_dev *dev) { diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index d6c950f8185..bec3b048e72 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,6 @@ #include <linux/pci.h> #include <linux/init.h> -#include "pci.h" +#include <asm/pci_x86.h> /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index bf69dbe08bf..373b9afe6d4 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -16,8 +16,7 @@ #include <asm/io_apic.h> #include <linux/irq.h> #include <linux/acpi.h> - -#include "pci.h" +#include <asm/pci_x86.h> #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index b722dd481b3..f1065b129e9 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -3,7 +3,7 @@ */ #include <linux/init.h> #include <linux/pci.h> -#include "pci.h" +#include <asm/pci_x86.h> /* * Discover remaining PCI buses in case there are peer host bridges. diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 654a2234f8f..89bf9242c80 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -15,8 +15,7 @@ #include <linux/acpi.h> #include <linux/bitmap.h> #include <asm/e820.h> - -#include "pci.h" +#include <asm/pci_x86.h> /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index f3c761dce69..8b2d561046a 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -13,7 +13,7 @@ #include <linux/init.h> #include <linux/acpi.h> #include <asm/e820.h> -#include "pci.h" +#include <asm/pci_x86.h> /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index a1994163c99..30007ffc8e1 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -10,8 +10,7 @@ #include <linux/acpi.h> #include <linux/bitmap.h> #include <asm/e820.h> - -#include "pci.h" +#include <asm/pci_x86.h> /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 1177845d318..2089354968a 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -7,7 +7,7 @@ #include <linux/nodemask.h> #include <mach_apic.h> #include <asm/mpspec.h> -#include "pci.h" +#include <asm/pci_x86.h> #define XQUAD_PORTIO_BASE 0xfe400000 #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index e11e9e803d5..b889d824f7c 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -29,7 +29,7 @@ #include <linux/init.h> #include <asm/olpc.h> #include <asm/geode.h> -#include "pci.h" +#include <asm/pci_x86.h> /* * In the tables below, the first two line (8 longwords) are the diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 37472fc6f72..b82cae970df 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -6,9 +6,8 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/uaccess.h> -#include "pci.h" -#include "pci-functions.h" - +#include <asm/pci_x86.h> +#include <asm/mach-default/pci-functions.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 42f4cb19fac..16d0c0eb0d1 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -9,11 +9,10 @@ #include <linux/init.h> #include <asm/setup.h> +#include <asm/pci_x86.h> #include <asm/visws/cobalt.h> #include <asm/visws/lithium.h> -#include "pci.h" - static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } static void pci_visws_disable_irq(struct pci_dev *dev) { } diff --git a/arch/x86/scripts/strip-symbols b/arch/x86/scripts/strip-symbols new file mode 100644 index 00000000000..a2f1ccb827c --- /dev/null +++ b/arch/x86/scripts/strip-symbols @@ -0,0 +1 @@ +__cpu_vendor_dev_X86_VENDOR_* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 1ef0f90813d..d9d35824c56 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -9,6 +9,9 @@ * Also alternative() doesn't work. */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING + #include <linux/kernel.h> #include <linux/posix-timers.h> #include <linux/time.h> diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 513f330c583..1241f118ab5 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -310,7 +310,7 @@ int __init sysenter_setup(void) } /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 257ba4a10ab..9c98cc6ba97 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) /* Setup a VMA at program startup for the vsyscall page. Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; unsigned long addr; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5e4686d70f6..bea215230b2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include <linux/console.h> #include <xen/interface/xen.h> +#include <xen/interface/version.h> #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> #include <xen/features.h> @@ -793,7 +794,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = 0; - switch(msr) { + switch (msr) { #ifdef CONFIG_X86_64 unsigned which; u64 base; @@ -1453,7 +1454,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) ident_pte = 0; pfn = 0; - for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { pte_t *pte_page; /* Reuse or allocate a page of ptes */ @@ -1471,7 +1472,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) } /* Install mappings */ - for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; if (pfn > max_pfn_mapped) @@ -1485,7 +1486,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) } } - for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); set_page_prot(pmd, PAGE_KERNEL_RO); @@ -1499,7 +1500,7 @@ static void convert_pfn_mfn(void *v) /* All levels are converted the same way, so just treat them as ptes. */ - for(i = 0; i < PTRS_PER_PTE; i++) + for (i = 0; i < PTRS_PER_PTE; i++) pte[i] = xen_make_pte(pte[i].pte); } @@ -1514,7 +1515,8 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1577,7 +1579,8 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf #else /* !CONFIG_X86_64 */ static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) { pmd_t *kernel_pmd; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 636ef4caa52..503c240e26c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -154,13 +154,13 @@ void xen_setup_mfn_list_list(void) { unsigned pfn, idx; - for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -179,7 +179,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top[topidx] = &mfn_list[pfn]; @@ -207,7 +207,7 @@ static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); BUG_ON(p == NULL); - for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; if (cmpxchg(pp, p2m_missing, p) != p2m_missing) @@ -407,7 +407,8 @@ out: preempt_enable(); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ return *ptep; @@ -878,7 +879,8 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) if (user_pgd) { xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ @@ -993,7 +995,8 @@ static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) pgd_t *user_pgd = xen_get_user_pgd(pgd); if (user_pgd) { - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + xen_do_pin(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(user_pgd))); xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } @@ -1079,7 +1082,7 @@ static void drop_other_mm_ref(void *info) static void xen_drop_mm_ref(struct mm_struct *mm) { - cpumask_t mask; + cpumask_var_t mask; unsigned cpu; if (current->active_mm == mm) { @@ -1091,7 +1094,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm) } /* Get the "official" set of cpus referring to our pagetable. */ - mask = mm->cpu_vm_mask; + if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { + for_each_online_cpu(cpu) { + if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) + continue; + smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); + } + return; + } + cpumask_copy(mask, &mm->cpu_vm_mask); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed @@ -1100,11 +1112,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm) if needed. */ for_each_online_cpu(cpu) { if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) - cpu_set(cpu, mask); + cpumask_set_cpu(cpu, mask); } - if (!cpus_empty(mask)) - smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + if (!cpumask_empty(mask)) + smp_call_function_many(mask, drop_other_mm_ref, mm, 1); + free_cpumask_var(mask); } #else static void xen_drop_mm_ref(struct mm_struct *mm) diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 8ea8a0d0b0d..c738644b543 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -154,7 +154,7 @@ void xen_mc_flush(void) ret, smp_processor_id()); dump_stack(); for (i = 0; i < b->mcidx; i++) { - printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, b->debug[i].op, b->debug[i].args[0], diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d6790108388..15c6c68db6a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -28,6 +28,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +extern void xen_sysenter_target(void); +extern void xen_syscall_target(void); +extern void xen_syscall32_target(void); /** @@ -110,7 +113,6 @@ static __cpuinit int register_callback(unsigned type, const void *func) void __cpuinit xen_enable_sysenter(void) { - extern void xen_sysenter_target(void); int ret; unsigned sysenter_feature; @@ -132,8 +134,6 @@ void __cpuinit xen_enable_syscall(void) { #ifdef CONFIG_X86_64 int ret; - extern void xen_syscall_target(void); - extern void xen_syscall32_target(void); ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { @@ -160,7 +160,8 @@ void __init xen_arch_setup(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index acd9b6705e0..c44e2069c7c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -33,7 +33,7 @@ #include "xen-ops.h" #include "mmu.h" -cpumask_t xen_cpu_initialized_map; +cpumask_var_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); static DEFINE_PER_CPU(int, callfunc_irq); @@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void) { int i, rc; - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; @@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); - xen_cpu_initialized_map = cpumask_of_cpu(0); + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) + panic("could not allocate xen_cpu_initialized_map\n"); + + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); /* Restrict the possible_map according to max_cpus. */ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--) + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) continue; cpu_clear(cpu, cpu_possible_map); } @@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct vcpu_guest_context *ctxt; struct desc_struct *gdt; - if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); @@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } -static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +static void xen_send_IPI_mask(const struct cpumask *mask, + enum ipi_vector vector) { unsigned cpu; - cpus_and(mask, mask, cpu_online_map); - - for_each_cpu_mask_nr(cpu, mask) + for_each_cpu_and(cpu, mask, cpu_online_mask) xen_send_IPI_one(cpu, vector); } -static void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. */ - for_each_cpu_mask_nr(cpu, mask) { + for_each_cpu(cpu, mask) { if (xen_vcpu_stolen(cpu)) { HYPERVISOR_sched_op(SCHEDOP_yield, 0); break; @@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask) static void xen_smp_send_call_function_single_ipi(int cpu) { - xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); + xen_send_IPI_mask(cpumask_of(cpu), + XEN_CALL_FUNCTION_SINGLE_VECTOR); } static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 2a234db5949..212ffe012b7 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled) pfn_to_mfn(xen_start_info->console.domU.mfn); } else { #ifdef CONFIG_SMP - xen_cpu_initialized_map = cpu_online_map; + BUG_ON(xen_cpu_initialized_map == NULL); + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); #endif xen_vcpu_restore(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c9f7cda48ed..14f24062349 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -132,8 +132,7 @@ static void do_stolen_accounting(void) *snap = state; /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. Passing NULL to - account_steal_time accounts the time as stolen. */ + including any left-overs from last time. */ stolen = runnable + offline + __get_cpu_var(residual_stolen); if (stolen < 0) @@ -141,11 +140,10 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __get_cpu_var(residual_stolen) = stolen; - account_steal_time(NULL, ticks); + account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. Passing idle to - account_steal_time accounts the time as idle/wait. */ + including any left-overs from last time. */ blocked += __get_cpu_var(residual_blocked); if (blocked < 0) @@ -153,7 +151,7 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); __get_cpu_var(residual_blocked) = blocked; - account_steal_time(idle_task(smp_processor_id()), ticks); + account_idle_ticks(ticks); } /* @@ -437,7 +435,7 @@ void xen_setup_timer(int cpu) evt = &per_cpu(xen_clock_events, cpu); memcpy(evt, xen_clockevent, sizeof(*evt)); - evt->cpumask = cpumask_of_cpu(cpu); + evt->cpumask = cpumask_of(cpu); evt->irq = irq; setup_runstate_info(cpu); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9e1afae8461..c1f8faf0a2c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); -extern cpumask_t xen_cpu_initialized_map; +extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} #endif |