diff options
Diffstat (limited to 'arch/x86')
170 files changed, 7423 insertions, 4663 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1e588131f4..e98e81a0497 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -38,7 +38,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK @@ -150,7 +150,10 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y config HAVE_CPUMASK_OF_CPU_MAP @@ -179,6 +182,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool @@ -586,7 +593,6 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - select AGP depends on X86_64 && PCI ---help--- Support for full DMA access of devices with 32bit memory access only diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8130334329c..527519b8a9f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -262,6 +262,15 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) +config MATOM + bool "Intel Atom" + ---help--- + + Select this for the Intel Atom platform. Intel Atom CPUs have an + in-order pipelining architecture and thus can benefit from + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 @@ -295,7 +304,7 @@ config X86_CPU config X86_L1_CACHE_BYTES int default "128" if MPSC - default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32 + default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 config X86_INTERNODE_CACHE_BYTES int @@ -310,7 +319,7 @@ config X86_L1_CACHE_SHIFT default "7" if MPENTIUM4 || MPSC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU config X86_XADD def_bool y @@ -359,7 +368,7 @@ config X86_INTEL_USERCOPY config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM config X86_USE_3DNOW def_bool y @@ -387,7 +396,7 @@ config X86_P6_NOP config X86_TSC def_bool y - depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 + depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 config X86_CMPXCHG64 def_bool y @@ -397,7 +406,7 @@ config X86_CMPXCHG64 # generates cmov. config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) config X86_MINIMUM_CPU_FAMILY int diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b..7983c420eaf 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -32,8 +32,8 @@ ifeq ($(CONFIG_X86_32),y) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: - KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ - echo $(call cc-option,-fno-unit-at-a-time); fi ;) + KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ + $(call cc-option,-fno-unit-at-a-time)) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/x86/Makefile_32.cpu @@ -55,6 +55,8 @@ else cflags-$(CONFIG_MCORE2) += \ $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ + $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) KBUILD_CFLAGS += $(cflags-y) @@ -72,7 +74,7 @@ endif ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y) + ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) stackp-y := -fstack-protector stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all KBUILD_CFLAGS += $(stackp-y) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 80177ec052f..30e9a264f69 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -33,6 +33,8 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-f cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ + $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 275dd177f19..11e8c6eb80a 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -31,7 +31,6 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { -#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; @@ -49,8 +48,7 @@ static int vesa_probe(void) vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ -#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ -#ifdef CONFIG_VIDEO_VESA + set_fs(vginfo.video_mode_ptr.seg); mode_ptr = vginfo.video_mode_ptr.off; @@ -102,9 +100,6 @@ static int vesa_probe(void) } return nmodes; -#else - return 0; -#endif /* CONFIG_VIDEO_VESA */ } static int vesa_set_mode(struct mode_info *mode) diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 8f8d827e254..819caa1f200 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -47,14 +47,6 @@ static u8 vga_set_basic_mode(void) initregs(&ireg); -#ifdef CONFIG_VIDEO_400_HACK - if (adapter >= ADAPTER_VGA) { - ireg.ax = 0x1202; - ireg.bx = 0x0030; - intcall(0x10, &ireg, NULL); - } -#endif - ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; @@ -62,11 +54,9 @@ static u8 vga_set_basic_mode(void) set_fs(0); rows = rdfs8(0x484); /* rows minus one */ -#ifndef CONFIG_VIDEO_400_HACK if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; -#endif if (mode != 3 && mode != 7) mode = 3; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index bad728b76fc..d42da380249 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -221,7 +221,6 @@ static unsigned int mode_menu(void) } } -#ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ static struct saved_screen { int x, y; @@ -299,10 +298,6 @@ static void restore_screen(void) ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); } -#else -#define save_screen() ((void)0) -#define restore_screen() ((void)0) -#endif void set_video(void) { diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index 5bb174a997f..ff339c5db31 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -17,19 +17,8 @@ #include <linux/types.h> -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* This code uses an extended set of video mode numbers. These include: +/* + * This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) * EXTENDED_VGA (-2) @@ -67,13 +56,8 @@ /* The "recalculate timings" flag */ #define VIDEO_RECALC 0x8000 -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN void store_screen(void); #define DO_STORE() store_screen() -#else -#define DO_STORE() ((void)0) -#endif /* CONFIG_VIDEO_RETAIN */ /* * Mode table structures diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index edb992ebef9..d28fad19654 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cee1dd2e69b..6c86acd847a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index c580c5ec1ca..585edebe12c 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -59,13 +59,6 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -static inline int kernel_fpu_using(void) -{ - if (in_interrupt() && !(read_cr0() & X86_CR0_TS)) - return 1; - return 0; -} - static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { unsigned long addr = (unsigned long)raw_ctx; @@ -89,7 +82,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, return -EINVAL; } - if (kernel_fpu_using()) + if (irq_fpu_usable()) err = crypto_aes_expand_key(ctx, in_key, key_len); else { kernel_fpu_begin(); @@ -110,7 +103,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (kernel_fpu_using()) + if (irq_fpu_usable()) crypto_aes_encrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -123,7 +116,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (kernel_fpu_using()) + if (irq_fpu_usable()) crypto_aes_decrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -349,7 +342,7 @@ static int ablk_encrypt(struct ablkcipher_request *req) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - if (kernel_fpu_using()) { + if (irq_fpu_usable()) { struct ablkcipher_request *cryptd_req = ablkcipher_request_ctx(req); memcpy(cryptd_req, req, sizeof(*req)); @@ -370,7 +363,7 @@ static int ablk_decrypt(struct ablkcipher_request *req) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - if (kernel_fpu_using()) { + if (irq_fpu_usable()) { struct ablkcipher_request *cryptd_req = ablkcipher_request_ctx(req); memcpy(cryptd_req, req, sizeof(*req)); @@ -636,7 +629,7 @@ static int __init aesni_init(void) int err; if (!cpu_has_aes) { - printk(KERN_ERR "Intel AES-NI instructions are not detected.\n"); + printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); return -ENODEV; } if ((err = crypto_register_alg(&aesni_alg))) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e590261ba05..ba331bfd111 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -537,7 +537,7 @@ ia32_sys_call_table: .quad sys_mkdir .quad sys_rmdir /* 40 */ .quad sys_dup - .quad sys32_pipe + .quad sys_pipe .quad compat_sys_times .quad quiet_ni_syscall /* old prof syscall holder */ .quad sys_brk /* 45 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 085a8c35f14..9f552719882 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -189,20 +189,6 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len, return sys_mprotect(start, len, prot); } -asmlinkage long sys32_pipe(int __user *fd) -{ - int retval; - int fds[2]; - - retval = do_pipe_flags(fds, 0); - if (retval) - goto out; - if (copy_to_user(fd, fds, sizeof(fds))) - retval = -EFAULT; -out: - return retval; -} - asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, struct sigaction32 __user *oact, unsigned int sigsetsize) diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1a37bcdc860..c240efc74e0 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -73,8 +73,6 @@ static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} #endif /* CONFIG_SMP */ -const unsigned char *const *find_nop_table(void); - /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ \ @@ -144,8 +142,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif -extern void add_nops(void *insns, unsigned int len); - /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. @@ -161,10 +157,7 @@ extern void add_nops(void *insns, unsigned int len); * Intel's errata. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. - * The _early version expects the memory to already be RW. */ - extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void *text_poke_early(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index bdf96f119f0..ac95995b7ba 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -25,6 +25,7 @@ #ifdef CONFIG_AMD_IOMMU extern int amd_iommu_init(void); extern int amd_iommu_init_dma_ops(void); +extern int amd_iommu_init_passthrough(void); extern void amd_iommu_detect(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_flush_all_domains(void); diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 0c878caaa0a..2a2cc7a78a8 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -143,22 +143,29 @@ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ #define EVT_LEN_MASK (0x9ULL << 56) +#define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 #define PAGE_MODE_2_LEVEL 0x02 #define PAGE_MODE_3_LEVEL 0x03 - -#define IOMMU_PDE_NL_0 0x000ULL -#define IOMMU_PDE_NL_1 0x200ULL -#define IOMMU_PDE_NL_2 0x400ULL -#define IOMMU_PDE_NL_3 0x600ULL - -#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) -#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) -#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) - -#define IOMMU_MAP_SIZE_L1 (1ULL << 21) -#define IOMMU_MAP_SIZE_L2 (1ULL << 30) -#define IOMMU_MAP_SIZE_L3 (1ULL << 39) +#define PAGE_MODE_4_LEVEL 0x04 +#define PAGE_MODE_5_LEVEL 0x05 +#define PAGE_MODE_6_LEVEL 0x06 + +#define PM_LEVEL_SHIFT(x) (12 + ((x) * 9)) +#define PM_LEVEL_SIZE(x) (((x) < 6) ? \ + ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \ + (0xffffffffffffffffULL)) +#define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL) +#define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL) +#define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \ + IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) +#define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL) + +#define PM_MAP_4k 0 +#define PM_ADDR_MASK 0x000ffffffffff000ULL +#define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \ + (~((1ULL << (12 + ((lvl) * 9))) - 1))) +#define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) #define IOMMU_PTE_P (1ULL << 0) #define IOMMU_PTE_TV (1ULL << 1) @@ -167,11 +174,6 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) -#define IOMMU_L1_PDE(address) \ - ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) -#define IOMMU_L2_PDE(address) \ - ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) - #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) @@ -194,11 +196,14 @@ #define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ #define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops domain for an IOMMU */ +#define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page + translation */ + extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ do { \ if (amd_iommu_dump) \ - printk(KERN_INFO "AMD IOMMU: " format, ## arg); \ + printk(KERN_INFO "AMD-Vi: " format, ## arg); \ } while(0); /* @@ -226,6 +231,7 @@ struct protection_domain { int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ unsigned long flags; /* flags to find out type of domain */ + bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ void *priv; /* private data */ }; @@ -337,6 +343,9 @@ struct amd_iommu { /* if one, we need to send a completion wait command */ bool need_sync; + /* becomes true if a command buffer reset is running */ + bool reset_in_progress; + /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; }; @@ -457,4 +466,7 @@ static inline void amd_iommu_stats_init(void) { } #endif /* CONFIG_AMD_IOMMU_STATS */ +/* some function prototypes */ +extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d4792584..586b7adb8e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -183,6 +183,10 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) +static inline void x2apic_force_phys(void) +{ + x2apic_phys = 1; +} #else static inline void check_x2apic(void) { @@ -194,6 +198,9 @@ static inline int x2apic_enabled(void) { return 0; } +static inline void x2apic_force_phys(void) +{ +} #define x2apic_preenabled 0 #define x2apic_supported() 0 diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7ddb36ab933..3b62da926de 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -8,12 +8,14 @@ * Ingo Molnar <mingo@redhat.com>, 1999, 2000 */ -#define APIC_DEFAULT_PHYS_BASE 0xfee00000 +#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 #define APIC_ID 0x20 #define APIC_LVR 0x30 #define APIC_LVR_MASK 0xFF00FF +#define APIC_LVR_DIRECTED_EOI (1 << 24) #define GET_APIC_VERSION(x) ((x) & 0xFFu) #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) #ifdef CONFIG_X86_32 @@ -40,6 +42,7 @@ #define APIC_DFR_CLUSTER 0x0FFFFFFFul #define APIC_DFR_FLAT 0xFFFFFFFFul #define APIC_SPIV 0xF0 +#define APIC_SPIV_DIRECTED_EOI (1 << 12) #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) #define APIC_ISR 0x100 diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 56be78f582f..b3ed1e1460f 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,7 +3,7 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x -# define __ASM_EX_SEC .section __ex_table +# define __ASM_EX_SEC .section __ex_table, "a" #else # define __ASM_FORM(x) " " #x " " # define __ASM_EX_SEC " .section __ex_table,\"a\"\n" @@ -38,10 +38,18 @@ #define _ASM_DI __ASM_REG(di) /* Exception table entry */ +#ifdef __ASSEMBLY__ +# define _ASM_EXTABLE(from,to) \ + __ASM_EX_SEC ; \ + _ASM_ALIGN ; \ + _ASM_PTR from , to ; \ + .previous +#else # define _ASM_EXTABLE(from,to) \ __ASM_EX_SEC \ _ASM_ALIGN "\n" \ _ASM_PTR #from "," #to "\n" \ " .previous\n" +#endif #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..6ca20218dd7 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -85,7 +85,8 @@ struct efi_info { struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - __u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[4]; /* 0x054 */ + __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4a28d22d479..847fee6493a 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -95,6 +95,7 @@ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index c68c361697e..4d447b732d8 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return percpu_read(current_task); + return percpu_read_stable(current_task); } #define current get_current() diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c993e9e0fed..e8de2f6f5ca 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -291,11 +291,24 @@ static inline unsigned long get_desc_base(const struct desc_struct *desc) return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); } +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + static inline unsigned long get_desc_limit(const struct desc_struct *desc) { return desc->limit0 | (desc->limit << 16); } +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a6adefa28b9..9d6684849fd 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -34,6 +34,12 @@ struct desc_struct { }; } __attribute__((packed)); +#define GDT_ENTRY_INIT(flags, base, limit) { { { \ + .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ + .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ + ((limit) & 0xf0000) | ((base) & 0xff000000), \ + } } } + enum { GATE_INTERRUPT = 0xE, GATE_TRAP = 0xF, diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 4994a20acbc..cee34e9ca45 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops; #endif }; +struct pdev_archdata { +}; + #endif /* _ASM_X86_DEVICE_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1c3f9435f1c..0ee770d23d0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -55,6 +55,24 @@ extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag); +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return 0; + + return addr + size <= *dev->dma_mask; +} + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr; +} + static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 3afc5e87cfd..ae6253ab902 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -87,9 +87,25 @@ CFI_RESTORE \reg .endm #else /*!CONFIG_X86_64*/ + .macro pushl_cfi reg + pushl \reg + CFI_ADJUST_CFA_OFFSET 4 + .endm - /* 32bit defenitions are missed yet */ + .macro popl_cfi reg + popl \reg + CFI_ADJUST_CFA_OFFSET -4 + .endm + .macro movl_cfi reg offset=0 + movl %\reg, \offset(%esp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movl_cfi_restore offset reg + movl \offset(%esp), %\reg + CFI_RESTORE \reg + .endm #endif /*!CONFIG_X86_64*/ #endif /*__ASSEMBLY__*/ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a..14f9890eb49 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -132,6 +132,9 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif __end_of_fixed_addresses }; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index bd2c6511c88..db24c2278be 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,13 +28,6 @@ #endif -/* FIXME: I don't want to stay hardcoded */ -#ifdef CONFIG_X86_64 -# define FTRACE_SYSCALL_MAX 296 -#else -# define FTRACE_SYSCALL_MAX 333 -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 175adf58dd4..0b20bbb758f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -26,6 +26,7 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); +extern void __math_state_restore(void); extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); @@ -301,6 +302,14 @@ static inline void kernel_fpu_end(void) preempt_enable(); } +static inline bool irq_fpu_usable(void) +{ + struct pt_regs *regs; + + return !in_interrupt() || !(regs = get_irq_regs()) || \ + user_mode(regs) || (read_cr0() & X86_CR0_TS); +} + /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. And these instructions diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 330ee807f89..85232d32fcb 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -150,11 +150,10 @@ extern int timer_through_8259; #define io_apic_assign_pci_irqs \ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) -#ifdef CONFIG_ACPI +extern u8 io_apic_unique_id(u8 id); extern int io_apic_get_unique_id(int ioapic, int apic_id); extern int io_apic_get_version(int ioapic); extern int io_apic_get_redir_entries(int ioapic); -#endif /* CONFIG_ACPI */ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, @@ -177,6 +176,16 @@ extern int setup_ioapic_entry(int apic, int irq, int polarity, int vector, int pin); extern void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e); + +struct mp_ioapic_gsi{ + int gsi_base; + int gsi_end; +}; +extern struct mp_ioapic_gsi mp_gsi_routing[]; +int mp_find_ioapic(int gsi); +int mp_find_ioapic_pin(int ioapic, int gsi); +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); + #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 static const int timer_through_8259 = 0; diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index 0d5b23b7b06..ec34c760665 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,94 +1 @@ -#ifndef _ASM_X86_IOCTLS_H -#define _ASM_X86_IOCTLS_H - -#include <asm/ioctl.h> - -/* 0x54 is just a magic number to make these relatively unique ('T') */ - -#define TCGETS 0x5401 -#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ -#define TCSETSW 0x5403 -#define TCSETSF 0x5404 -#define TCGETA 0x5405 -#define TCSETA 0x5406 -#define TCSETAW 0x5407 -#define TCSETAF 0x5408 -#define TCSBRK 0x5409 -#define TCXONC 0x540A -#define TCFLSH 0x540B -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E -#define TIOCGPGRP 0x540F -#define TIOCSPGRP 0x5410 -#define TIOCOUTQ 0x5411 -#define TIOCSTI 0x5412 -#define TIOCGWINSZ 0x5413 -#define TIOCSWINSZ 0x5414 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define FIONREAD 0x541B -#define TIOCINQ FIONREAD -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -#define FIONBIO 0x5421 -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TCGETS2 _IOR('T', 0x2A, struct termios2) -#define TCSETS2 _IOW('T', 0x2B, struct termios2) -#define TCSETSW2 _IOW('T', 0x2C, struct termios2) -#define TCSETSF2 _IOW('T', 0x2D, struct termios2) -#define TIOCGRS485 0x542E -#define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) - /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ -#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ -#define TCSETX 0x5433 -#define TCSETXF 0x5434 -#define TCSETXW 0x5435 - -#define FIONCLEX 0x5450 -#define FIOCLEX 0x5451 -#define FIOASYNC 0x5452 -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ -#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -#define FIOQSIZE 0x5460 - -/* Used for packet mode */ -#define TIOCPKT_DATA 0 -#define TIOCPKT_FLUSHREAD 1 -#define TIOCPKT_FLUSHWRITE 2 -#define TIOCPKT_STOP 4 -#define TIOCPKT_START 8 -#define TIOCPKT_NOSTOP 16 -#define TIOCPKT_DOSTOP 32 - -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ - -#endif /* _ASM_X86_IOCTLS_H */ +#include <asm-generic/ioctls.h> diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index ee678fd5159..84c7e51cb6d 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,28 +1 @@ -#ifndef _ASM_X86_IPCBUF_H -#define _ASM_X86_IPCBUF_H - -/* - * The ipc64_perm structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space is left for: - * - 32-bit mode_t and seq - * - 2 miscellaneous 32-bit values - */ - -struct ipc64_perm { - __kernel_key_t key; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_uid32_t cuid; - __kernel_gid32_t cgid; - __kernel_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* _ASM_X86_IPCBUF_H */ +#include <asm-generic/ipcbuf.h> diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c6ccbe7e81a..9e2b952f810 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -13,14 +13,13 @@ static inline unsigned long native_save_fl(void) unsigned long flags; /* - * Note: this needs to be "=r" not "=rm", because we have the - * stack offset from what gcc expects at the time the "pop" is - * executed, and so a memory reference with respect to the stack - * would end up using the wrong address. + * "=rm" is safe here, because "pop" adjusts the stack before + * it evaluates its effective address -- this is part of the + * documented behavior of the "pop" instruction. */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" - : "=r" (flags) + : "=rm" (flags) : /* no input */ : "memory"); diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 125be8b1956..4a5fe914dc5 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -17,6 +17,8 @@ #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX +#define __KVM_HAVE_MCE +#define __KVM_HAVE_PIT_STATE2 /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -236,6 +238,14 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7ed2c42311..b7ed2c42311 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eabdc1cfab5..3be000435fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/mm.h> #include <linux/mmu_notifier.h> +#include <linux/tracepoint.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -37,12 +38,14 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ - | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ - | X86_CR0_MP) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_GUEST_CR4_MASK \ (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -51,12 +54,12 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -/* shadow tables are PAE even on non-PAE hosts */ -#define KVM_HPAGE_SHIFT 21 -#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) -#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) - -#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) +/* KVM Hugepage definitions for x86 */ +#define KVM_NR_PAGE_SIZES 3 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define DE_VECTOR 0 #define DB_VECTOR 1 @@ -120,6 +123,10 @@ enum kvm_reg { NR_VCPU_REGS }; +enum kvm_reg_ex { + VCPU_EXREG_PDPTR = NR_VCPU_REGS, +}; + enum { VCPU_SREG_ES, VCPU_SREG_CS, @@ -131,7 +138,7 @@ enum { VCPU_SREG_LDTR, }; -#include <asm/kvm_x86_emulate.h> +#include <asm/kvm_emulate.h> #define KVM_NR_MEM_OBJS 40 @@ -308,7 +315,6 @@ struct kvm_vcpu_arch { struct { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ - int largepage; unsigned long mmu_seq; } update_pte; @@ -334,16 +340,6 @@ struct kvm_vcpu_arch { u8 nr; } interrupt; - struct { - int vm86_active; - u8 save_iopl; - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } tr, es, ds, fs, gs; - } rmode; int halt_request; /* real mode on Intel only */ int cpuid_nent; @@ -366,13 +362,15 @@ struct kvm_vcpu_arch { u32 pat; int switch_db_regs; - unsigned long host_db[KVM_NR_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; + + u64 mcg_cap; + u64 mcg_status; + u64 mcg_ctl; + u64 *mce_banks; }; struct kvm_mem_alias { @@ -409,6 +407,7 @@ struct kvm_arch{ struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; @@ -526,6 +525,9 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + bool (*gb_page_enable)(void); + + const struct trace_print_flags *exit_reasons_str; }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -618,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); @@ -752,8 +755,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define MSR_IA32_TIME_STAMP_COUNTER 0x010 - #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) @@ -796,5 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305ae09..c584076a47f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H +#include <linux/types.h> + /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 5136dad57cb..0d97deba1e3 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -90,8 +90,9 @@ static inline void lguest_set_ts(void) } /* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) -#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) +#define FULL_EXEC_SEGMENT \ + ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) +#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 751af2550ed..593e51d4643 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,20 +1,8 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include <asm-generic/mman-common.h> - #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ +#include <asm-generic/mman.h> #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d..3e2ce58a31a 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,18 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; - -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr -#else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr -#endif +#include <asm-generic/module.h> #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ @@ -28,6 +17,8 @@ struct mod_arch_specific {}; #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MATOM +#define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 #define MODULE_PROC_FAMILY "686 " #elif defined CONFIG_MPENTIUMII diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index 7e4e9481f51..809134c644a 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,39 +1 @@ -#ifndef _ASM_X86_MSGBUF_H -#define _ASM_X86_MSGBUF_H - -/* - * The msqid64_ds structure for i386 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space on i386 is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - * - * Pad space on x8664 is left for: - * - 2 miscellaneous 64-bit values - */ -struct msqid64_ds { - struct ipc64_perm msg_perm; - __kernel_time_t msg_stime; /* last msgsnd time */ -#ifdef __i386__ - unsigned long __unused1; -#endif - __kernel_time_t msg_rtime; /* last msgrcv time */ -#ifdef __i386__ - unsigned long __unused2; -#endif - __kernel_time_t msg_ctime; /* last change time */ -#ifdef __i386__ - unsigned long __unused3; -#endif - unsigned long msg_cbytes; /* current number of bytes on queue */ - unsigned long msg_qnum; /* number of messages in queue */ - unsigned long msg_qbytes; /* max number of bytes on queue */ - __kernel_pid_t msg_lspid; /* pid of last msgsnd */ - __kernel_pid_t msg_lrpid; /* last receive pid */ - unsigned long __unused4; - unsigned long __unused5; -}; - -#endif /* _ASM_X86_MSGBUF_H */ +#include <asm-generic/msgbuf.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6be7fc254b5..bd5549034a9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -374,6 +374,7 @@ /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 +#define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 #endif /* _ASM_X86_MSR_INDEX_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 48ad9d29484..7e2b6ba962f 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -3,10 +3,16 @@ #include <asm/msr-index.h> -#ifdef __KERNEL__ #ifndef __ASSEMBLY__ #include <linux/types.h> +#include <linux/ioctl.h> + +#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) +#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) + +#ifdef __KERNEL__ + #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> @@ -67,23 +73,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, ".previous\n\t" _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) - : "c" (msr), [fault] "i" (-EFAULT)); - return EAX_EDX_VAL(val, low, high); -} - -static inline unsigned long long native_read_msr_amd_safe(unsigned int msr, - int *err) -{ - DECLARE_ARGS(val, low, high); - - asm volatile("2: rdmsr ; xor %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: mov %3,%0 ; jmp 1b\n\t" - ".previous\n\t" - _ASM_EXTABLE(2b, 3b) - : "=r" (*err), EAX_EDX_RET(val, low, high) - : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT)); + : "c" (msr), [fault] "i" (-EIO)); return EAX_EDX_VAL(val, low, high); } @@ -106,13 +96,16 @@ notrace static inline int native_write_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : [err] "=a" (err) : "c" (msr), "0" (low), "d" (high), - [fault] "i" (-EFAULT) + [fault] "i" (-EIO) : "memory"); return err; } extern unsigned long long native_read_tsc(void); +extern int native_rdmsr_safe_regs(u32 regs[8]); +extern int native_wrmsr_safe_regs(u32 regs[8]); + static __always_inline unsigned long long __native_read_tsc(void) { DECLARE_ARGS(val, low, high); @@ -181,14 +174,44 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) *p = native_read_msr_safe(msr, &err); return err; } + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { + u32 gprs[8] = { 0 }; int err; - *p = native_read_msr_amd_safe(msr, &err); + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = native_rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + return err; } +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return native_wrmsr_safe_regs(gprs); +} + +static inline int rdmsr_safe_regs(u32 regs[8]) +{ + return native_rdmsr_safe_regs(regs); +} + +static inline int wrmsr_safe_regs(u32 regs[8]) +{ + return native_wrmsr_safe_regs(regs); +} + #define rdtscl(low) \ ((low) = (u32)__native_read_tsc()) @@ -228,6 +251,8 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { @@ -258,7 +283,15 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } +static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) +{ + return rdmsr_safe_regs(regs); +} +static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) +{ + return wrmsr_safe_regs(regs); +} #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c86e5ed4af5..e63cf7d441e 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; -void __trigger_all_cpu_backtrace(void); -#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace static inline void localise_nmi_watchdog(void) { diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index ad2668ee1aa..6d8723a766c 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -65,6 +65,8 @@ 6: osp nopl 0x00(%eax,%eax,1) 7: nopl 0x00000000(%eax) 8: nopl 0x00000000(%eax,%eax,1) + Note: All the above are assumed to be a single instruction. + There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 #define P6_NOP2 ".byte 0x66,0x90\n" diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index 6f0d0422f4c..965d4542797 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,22 +1 @@ -#ifndef _ASM_X86_PARAM_H -#define _ASM_X86_PARAM_H - -#ifdef __KERNEL__ -# define HZ CONFIG_HZ /* Internal kernel timer frequency */ -# define USER_HZ 100 /* some user interfaces are */ -# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#define EXEC_PAGESIZE 4096 - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif /* _ASM_X86_PARAM_H */ +#include <asm-generic/param.h> diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a083..40d6586af25 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -7,689 +7,11 @@ #include <asm/pgtable_types.h> #include <asm/asm.h> -/* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) - -#include <asm/desc_defs.h> -#endif /* X86_64 */ - -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) +#include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/cpumask.h> -#include <asm/kmap_types.h> -#include <asm/desc_defs.h> - -struct page; -struct thread_struct; -struct desc_ptr; -struct tss_struct; -struct mm_struct; -struct desc_struct; -struct task_struct; - -/* - * Wrapper type for pointers to code which uses the non-standard - * calling convention. See PV_CALL_SAVE_REGS_THUNK below. - */ -struct paravirt_callee_save { - void *func; -}; - -/* general info */ -struct pv_info { - unsigned int kernel_rpl; - int shared_kernel_pmd; - int paravirt_enabled; - const char *name; -}; - -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. - */ - unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, - unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*arch_setup)(void); - char *(*memory_setup)(void); - void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); -}; - - -struct pv_lazy_ops { - /* Set deferred update mode, used for batching operations. */ - void (*enter)(void); - void (*leave)(void); -}; - -struct pv_time_ops { - void (*time_init)(void); - - /* Set and set time of day */ - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long); - - unsigned long long (*sched_clock)(void); - unsigned long (*get_tsc_khz)(void); -}; - -struct pv_cpu_ops { - /* hooks for various privileged instructions */ - unsigned long (*get_debugreg)(int regno); - void (*set_debugreg)(int regno, unsigned long value); - - void (*clts)(void); - - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); - - unsigned long (*read_cr4_safe)(void); - unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); - -#ifdef CONFIG_X86_64 - unsigned long (*read_cr8)(void); - void (*write_cr8)(unsigned long); -#endif - - /* Segment descriptor handling */ - void (*load_tr_desc)(void); - void (*load_gdt)(const struct desc_ptr *); - void (*load_idt)(const struct desc_ptr *); - void (*store_gdt)(struct desc_ptr *); - void (*store_idt)(struct desc_ptr *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); -#ifdef CONFIG_X86_64 - void (*load_gs_index)(unsigned int idx); -#endif - void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, - const void *desc); - void (*write_gdt_entry)(struct desc_struct *, - int entrynum, const void *desc, int size); - void (*write_idt_entry)(gate_desc *, - int entrynum, const gate_desc *gate); - void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); - void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); - - void (*set_iopl_mask)(unsigned mask); - - void (*wbinvd)(void); - void (*io_delay)(void); - - /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); - - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr_amd)(unsigned int msr, int *err); - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - - u64 (*read_tsc)(void); - u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); - - /* - * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) - */ - void (*irq_enable_sysexit)(void); - - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - - void (*swapgs)(void); - - void (*start_context_switch)(struct task_struct *prev); - void (*end_context_switch)(struct task_struct *next); -}; - -struct pv_irq_ops { - void (*init_IRQ)(void); - - /* - * Get/set interrupt state. save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. - * - * NOTE: These functions callers expect the callee to preserve - * more registers than the standard C calling convention. - */ - struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; - struct paravirt_callee_save irq_disable; - struct paravirt_callee_save irq_enable; - - void (*safe_halt)(void); - void (*halt)(void); - -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif -}; - -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - -struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); - - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); - - /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. - */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); - - - /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long va); - - /* Hooks for allocating and freeing a pagetable top-level */ - int (*pgd_alloc)(struct mm_struct *mm); - void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); - - /* - * Hooks for allocating/releasing pagetable pages when they're - * attached to a pagetable - */ - void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); - void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); - void (*release_pte)(unsigned long pfn); - void (*release_pmd)(unsigned long pfn); - void (*release_pud)(unsigned long pfn); - - /* Pagetable manipulation functions */ - void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); - - struct paravirt_callee_save pte_val; - struct paravirt_callee_save make_pte; - - struct paravirt_callee_save pgd_val; - struct paravirt_callee_save make_pgd; - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void (*pte_clear)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - -#endif /* CONFIG_X86_PAE */ - - void (*set_pud)(pud_t *pudp, pud_t pudval); - - struct paravirt_callee_save pmd_val; - struct paravirt_callee_save make_pmd; - -#if PAGETABLE_LEVELS == 4 - struct paravirt_callee_save pud_val; - struct paravirt_callee_save make_pud; - - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ - -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - - struct pv_lazy_ops lazy_mode; - - /* dom0 ops */ - - /* Sometimes the physical address is a pfn, and sometimes its - an mfn. We can tell which is which from the index. */ - void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, - phys_addr_t phys, pgprot_t flags); -}; - -struct raw_spinlock; -struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); -}; - -/* This contains all the paravirt structures: we get a convenient - * number for each function using the offset which we use to indicate - * what to patch. */ -struct paravirt_patch_template { - struct pv_init_ops pv_init_ops; - struct pv_time_ops pv_time_ops; - struct pv_cpu_ops pv_cpu_ops; - struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; - struct pv_mmu_ops pv_mmu_ops; - struct pv_lock_ops pv_lock_ops; -}; - -extern struct pv_info pv_info; -extern struct pv_init_ops pv_init_ops; -extern struct pv_time_ops pv_time_ops; -extern struct pv_cpu_ops pv_cpu_ops; -extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; -extern struct pv_mmu_ops pv_mmu_ops; -extern struct pv_lock_ops pv_lock_ops; - -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "i" (&(op)) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") - -/* Simple instruction patching code. */ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - -unsigned paravirt_patch_nop(void); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len); -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len); -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len); - -int paravirt_disable_iospace(void); - -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" - -/* - * These macros are intended to wrap calls through one of the paravirt - * ops structs, so that they can be later identified and patched at - * runtime. - * - * Normally, a call to a pv_op function is a simple indirect call: - * (pv_op_struct.operations)(args...). - * - * Unfortunately, this is a relatively slow operation for modern CPUs, - * because it cannot necessarily determine what the destination - * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or - * ideally, patch an inline implementation into the callsite. (Direct - * calls are essentially free, because the call and return addresses - * are completely predictable.) - * - * For i386, these macros rely on the standard gcc "regparm(3)" calling - * convention, in which the first three arguments are placed in %eax, - * %edx, %ecx (in that order), and the remaining arguments are placed - * on the stack. All caller-save registers (eax,edx,ecx) are expected - * to be modified (either clobbered or used for return values). - * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, - * %rdx, and %rcx. Note that for this reason, x86_64 does not need any - * special handling for dealing with 4 arguments, unlike i386. - * However, x86_64 also have to clobber all caller saved registers, which - * unfortunately, are quite a bit (r8 - r11) - * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * - * Unfortunately there's no way to get gcc to generate the args setup - * for the call, and then allow the call itself to be generated by an - * inline asm. Because of this, we must do the complete arg setup and - * return value handling from within these macros. This is fairly - * cumbersome. - * - * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. - * It could be extended to more arguments, but there would be little - * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. - * - * When there is a return value, the invoker of the macro must specify - * the return type. The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return - * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of - * the return value size. - * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments - * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments - * in low,high order - * - * Small structures are passed and returned in registers. The macro - * calling convention can't directly deal with this, so the wrapper - * functions must do this. - * - * These PVOP_* macros are only defined within this header. This - * means that all uses must be wrapped in inline functions. This also - * makes sure the incoming and outgoing types are always correct. - */ -#ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - -#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS -#define VEXTRA_CLOBBERS -#else /* CONFIG_X86_64 */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax - -#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ - "=S" (__esi), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" -#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_PARAVIRT_DEBUG -#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) -#else -#define PVOP_TEST_NULL(op) ((void)op) -#endif - -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ - ({ \ - rettype __ret; \ - PVOP_CALL_ARGS; \ - PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ - } \ - __ret; \ - }) - -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ - ({ \ - PVOP_VCALL_ARGS; \ - PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - }) - -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) - -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - - -#define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") -#define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") - -#define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") -#define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") - - -#define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) - -#define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) - - -#define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - - -#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -#define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) - -/* This is the only difference in x86_64. We can make it much simpler */ -#ifdef CONFIG_X86_32 -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif static inline int paravirt_enabled(void) { @@ -820,15 +142,22 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) { return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } -static inline u64 paravirt_read_msr_amd(unsigned msr, int *err) + +static inline int paravirt_rdmsr_regs(u32 *regs) { - return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err); + return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); } + static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } +static inline int paravirt_wrmsr_regs(u32 *regs) +{ + return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); +} + /* These should all do BUG_ON(_err), but our headers are too tangled. */ #define rdmsr(msr, val1, val2) \ do { \ @@ -862,6 +191,9 @@ do { \ _err; \ }) +#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) +#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) + static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; @@ -871,12 +203,31 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) } static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { + u32 gprs[8] = { 0 }; int err; - *p = paravirt_read_msr_amd(msr, &err); + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = paravirt_rdmsr_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + return err; } +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return paravirt_wrmsr_regs(gprs); +} + static inline u64 paravirt_read_tsc(void) { return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); @@ -1393,20 +744,6 @@ static inline void pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); - #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { @@ -1437,12 +774,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, pv_mmu_ops.set_fixmap(idx, phys, flags); } -void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); -u64 _paravirt_ident_64(u64); - -#define paravirt_nop ((void *)_paravirt_nop) - #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static inline int __raw_spin_is_locked(struct raw_spinlock *lock) @@ -1479,17 +810,6 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) #endif -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ - u8 len; /* length of original instruction */ - u16 clobbers; /* what registers you may clobber */ -}; - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; - #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" #define PV_RESTORE_REGS "popl %edx; popl %ecx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h new file mode 100644 index 00000000000..25402d0006e --- /dev/null +++ b/arch/x86/include/asm/paravirt_types.h @@ -0,0 +1,721 @@ +#ifndef _ASM_X86_PARAVIRT_TYPES_H +#define _ASM_X86_PARAVIRT_TYPES_H + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0 +#define CLBR_EAX (1 << 0) +#define CLBR_ECX (1 << 1) +#define CLBR_EDX (1 << 2) +#define CLBR_EDI (1 << 3) + +#ifdef CONFIG_X86_32 +/* CLBR_ANY should match all regs platform has. For i386, that's just it */ +#define CLBR_ANY ((1 << 4) - 1) + +#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) +#define CLBR_SCRATCH (0) +#else +#define CLBR_RAX CLBR_EAX +#define CLBR_RCX CLBR_ECX +#define CLBR_RDX CLBR_EDX +#define CLBR_RDI CLBR_EDI +#define CLBR_RSI (1 << 4) +#define CLBR_R8 (1 << 5) +#define CLBR_R9 (1 << 6) +#define CLBR_R10 (1 << 7) +#define CLBR_R11 (1 << 8) + +#define CLBR_ANY ((1 << 9) - 1) + +#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ + CLBR_RCX | CLBR_R8 | CLBR_R9) +#define CLBR_RET_REG (CLBR_RAX) +#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) + +#endif /* X86_64 */ + +#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) + +#ifndef __ASSEMBLY__ + +#include <asm/desc_defs.h> +#include <asm/kmap_types.h> + +struct page; +struct thread_struct; +struct desc_ptr; +struct tss_struct; +struct mm_struct; +struct desc_struct; +struct task_struct; +struct cpumask; + +/* + * Wrapper type for pointers to code which uses the non-standard + * calling convention. See PV_CALL_SAVE_REGS_THUNK below. + */ +struct paravirt_callee_save { + void *func; +}; + +/* general info */ +struct pv_info { + unsigned int kernel_rpl; + int shared_kernel_pmd; + int paravirt_enabled; + const char *name; +}; + +struct pv_init_ops { + /* + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. + */ + unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, + unsigned long addr, unsigned len); + + /* Basic arch-specific setup */ + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*post_allocator_init)(void); + + /* Print a banner to identify the environment */ + void (*banner)(void); +}; + + +struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. */ + void (*enter)(void); + void (*leave)(void); +}; + +struct pv_time_ops { + void (*time_init)(void); + + /* Set and set time of day */ + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + + unsigned long long (*sched_clock)(void); + unsigned long (*get_tsc_khz)(void); +}; + +struct pv_cpu_ops { + /* hooks for various privileged instructions */ + unsigned long (*get_debugreg)(int regno); + void (*set_debugreg)(int regno, unsigned long value); + + void (*clts)(void); + + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); + + unsigned long (*read_cr4_safe)(void); + unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); + +#ifdef CONFIG_X86_64 + unsigned long (*read_cr8)(void); + void (*write_cr8)(unsigned long); +#endif + + /* Segment descriptor handling */ + void (*load_tr_desc)(void); + void (*load_gdt)(const struct desc_ptr *); + void (*load_idt)(const struct desc_ptr *); + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); +#ifdef CONFIG_X86_64 + void (*load_gs_index)(unsigned int idx); +#endif + void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, + const void *desc); + void (*write_gdt_entry)(struct desc_struct *, + int entrynum, const void *desc, int size); + void (*write_idt_entry)(gate_desc *, + int entrynum, const gate_desc *gate); + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void (*free_ldt)(struct desc_struct *ldt, unsigned entries); + + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); + + void (*wbinvd)(void); + void (*io_delay)(void); + + /* cpuid emulation, mostly so that caps bits can be disabled */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + /* MSR, PMC and TSR operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (*read_msr)(unsigned int msr, int *err); + int (*rdmsr_regs)(u32 *regs); + int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + int (*wrmsr_regs)(u32 *regs); + + u64 (*read_tsc)(void); + u64 (*read_pmc)(int counter); + unsigned long long (*read_tscp)(unsigned int *aux); + + /* + * Atomically enable interrupts and return to userspace. This + * is only ever used to return to 32-bit processes; in a + * 64-bit kernel, it's used for 32-on-64 compat processes, but + * never native 64-bit processes. (Jump, not call.) + */ + void (*irq_enable_sysexit)(void); + + /* + * Switch to usermode gs and return to 64-bit usermode using + * sysret. Only used in 64-bit kernels to return to 64-bit + * processes. Usermode register state, including %rsp, must + * already be restored. + */ + void (*usergs_sysret64)(void); + + /* + * Switch to usermode gs and return to 32-bit usermode using + * sysret. Used to return to 32-on-64 compat processes. + * Other usermode register state, including %esp, must already + * be restored. + */ + void (*usergs_sysret32)(void); + + /* Normal iret. Jump to this with the standard iret stack + frame set up. */ + void (*iret)(void); + + void (*swapgs)(void); + + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + + /* + * Get/set interrupt state. save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + * + * NOTE: These functions callers expect the callee to preserve + * more registers than the standard C calling convention. + */ + struct paravirt_callee_save save_fl; + struct paravirt_callee_save restore_fl; + struct paravirt_callee_save irq_disable; + struct paravirt_callee_save irq_enable; + + void (*safe_halt)(void); + void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif +}; + +struct pv_apic_ops { +#ifdef CONFIG_X86_LOCAL_APIC + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); +#endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. + */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + + + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const struct cpumask *cpus, + struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ + void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); + void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*release_pte)(unsigned long pfn); + void (*release_pmd)(unsigned long pfn); + void (*release_pud)(unsigned long pfn); + + /* Pagetable manipulation functions */ + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; + + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void (*pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); + +#endif /* CONFIG_X86_PAE */ + + void (*set_pud)(pud_t *pudp, pud_t pudval); + + struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; + +#if PAGETABLE_LEVELS == 4 + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); +#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + + struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags); +}; + +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. */ +struct paravirt_patch_template { + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +/* Simple instruction patching code. */ +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); +unsigned paravirt_patch_ignore(unsigned len); +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len); +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len); +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len); + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len); + +int paravirt_disable_iospace(void); + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +#define PARAVIRT_CALL "call *%c[paravirt_opptr];" + +/* + * These macros are intended to wrap calls through one of the paravirt + * ops structs, so that they can be later identified and patched at + * runtime. + * + * Normally, a call to a pv_op function is a simple indirect call: + * (pv_op_struct.operations)(args...). + * + * Unfortunately, this is a relatively slow operation for modern CPUs, + * because it cannot necessarily determine what the destination + * address is. In this case, the address is a runtime constant, so at + * the very least we can patch the call to e a simple direct call, or + * ideally, patch an inline implementation into the callsite. (Direct + * calls are essentially free, because the call and return addresses + * are completely predictable.) + * + * For i386, these macros rely on the standard gcc "regparm(3)" calling + * convention, in which the first three arguments are placed in %eax, + * %edx, %ecx (in that order), and the remaining arguments are placed + * on the stack. All caller-save registers (eax,edx,ecx) are expected + * to be modified (either clobbered or used for return values). + * X86_64, on the other hand, already specifies a register-based calling + * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * %rdx, and %rcx. Note that for this reason, x86_64 does not need any + * special handling for dealing with 4 arguments, unlike i386. + * However, x86_64 also have to clobber all caller saved registers, which + * unfortunately, are quite a bit (r8 - r11) + * + * The call instruction itself is marked by placing its start address + * and size into the .parainstructions section, so that + * apply_paravirt() in arch/i386/kernel/alternative.c can do the + * appropriate patching under the control of the backend pv_init_ops + * implementation. + * + * Unfortunately there's no way to get gcc to generate the args setup + * for the call, and then allow the call itself to be generated by an + * inline asm. Because of this, we must do the complete arg setup and + * return value handling from within these macros. This is fairly + * cumbersome. + * + * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. + * It could be extended to more arguments, but there would be little + * to be gained from that. For each number of arguments, there are + * the two VCALL and CALL variants for void and non-void functions. + * + * When there is a return value, the invoker of the macro must specify + * the return type. The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * the return value size. + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments + * in low,high order + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ +#ifdef CONFIG_X86_32 +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS +#define VEXTRA_CLOBBERS +#else /* CONFIG_X86_64 */ +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + +#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ + pre, post, ...) \ + ({ \ + rettype __ret; \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) + +#define __PVOP_CALL(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ + EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + +#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + }) + +#define __PVOP_VCALL(op, pre, post, ...) \ + ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, \ + pre, post, ##__VA_ARGS__) + +#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + + +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALLEE0(rettype, op) \ + __PVOP_CALLEESAVE(rettype, op, "", "") +#define PVOP_VCALLEE0(op) \ + __PVOP_VCALLEESAVE(op, "", "") + + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + +#define PVOP_CALLEE1(rettype, op, arg1) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALLEE1(op, arg1) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + +#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALLEE2(op, arg1, arg2) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + +/* This is the only difference in x86_64. We can make it much simpler */ +#ifdef CONFIG_X86_32 +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#else +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#endif + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); + +void _paravirt_nop(void); +u32 _paravirt_ident_32(u32); +u64 _paravirt_ident_64(u64); + +#define paravirt_nop ((void *)_paravirt_nop) + +/* These all sit in the .parainstructions section to tell us what to patch. */ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PARAVIRT_TYPES_H */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1ff685ca221..f76a162c082 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -48,7 +48,6 @@ extern unsigned int pcibios_assign_all_busses(void); #else #define pcibios_assign_all_busses() 0 #endif -#define pcibios_scan_all_fns(a, b) 0 extern unsigned long pci_mem_start; #define PCIBIOS_MIN_IO 0x1000 diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..b65a36defeb 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -49,7 +49,7 @@ #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x #define __my_cpu_offset percpu_read(this_cpu_off) #else -#define __percpu_arg(x) "%" #x +#define __percpu_arg(x) "%P" #x #endif /* @@ -104,36 +104,48 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ default: __bad_percpu_size(); \ } \ ret__; \ }) -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var) +/* + * percpu_read() makes gcc load the percpu variable every time it is + * accessed while percpu_read_stable() allows the value to be cached. + * percpu_read_stable() is more efficient and can be used if its value + * is guaranteed to be valid across cpus. The current users include + * get_current() and get_thread_info() both of which are actually + * per-thread variables implemented as per-cpu variables and thus + * stable for the duration of the respective task. + */ +#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ + "m" (per_cpu__##var)) +#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ + "p" (&per_cpu__##var)) #define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) #define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) #define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) @@ -156,15 +168,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index fa64e401589..e7b7c938ae2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,16 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed counter range, since lower + * values are used by actual fixed counters and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 16748077559..4c5b51fdc78 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -135,6 +135,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -359,7 +364,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ -static inline unsigned pmd_index(unsigned long address) +static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } @@ -379,7 +384,7 @@ static inline unsigned pmd_index(unsigned long address) * this function returns the index of the entry in the pte page which would * control the given virtual address */ -static inline unsigned pte_index(unsigned long address) +static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } @@ -430,11 +435,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; -} - static inline int pud_large(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == @@ -470,7 +470,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned pud_index(unsigned long address) +static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c7768269b1c..42a3f936dad 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -403,7 +403,17 @@ extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR -DECLARE_PER_CPU(unsigned long, stack_canary); +/* + * Make sure stack canary segment base is cached-aligned: + * "For Intel Atom processors, avoid non zero segment base address + * that is not aligned to cache line boundary at all cost." + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) + */ +struct stack_canary { + char __pad[20]; /* canary at %gs:20 */ + unsigned long canary; +}; +DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif #endif /* X86_64 */ @@ -703,13 +713,23 @@ static inline void cpu_relax(void) rep_nop(); } -/* Stop speculative execution: */ +/* Stop speculative execution and prefetching of modified code. */ static inline void sync_core(void) { int tmp; - asm volatile("cpuid" : "=a" (tmp) : "0" (1) - : "ebx", "ecx", "edx", "memory"); +#if defined(CONFIG_M386) || defined(CONFIG_M486) + if (boot_cpu_data.x86 < 5) + /* There is no speculative execution. + * jmp is a barrier to prefetching. */ + asm volatile("jmp 1f\n1:\n" ::: "memory"); + else +#endif + /* cpuid is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. */ + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); } static inline void __monitor(const void *eax, unsigned long ecx, @@ -1000,4 +1020,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +extern int amd_get_nb_id(int cpu); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 263d397d2ee..75af592677e 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -1,33 +1,8 @@ #ifndef _ASM_X86_SCATTERLIST_H #define _ASM_X86_SCATTERLIST_H -#include <asm/types.h> - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -#define ARCH_HAS_SG_CHAIN #define ISA_DMA_THRESHOLD (0x00ffffff) -/* - * These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. - */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) -#else -# define sg_dma_len(sg) ((sg)->dma_length) -#endif +#include <asm-generic/scatterlist.h> #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index b51413b7497..83c05fc2de3 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,51 +1 @@ -#ifndef _ASM_X86_SHMBUF_H -#define _ASM_X86_SHMBUF_H - -/* - * The shmid64_ds structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * Pad space on 32 bit is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - * - * Pad space on 64 bit is left for: - * - 2 miscellaneous 64-bit values - */ - -struct shmid64_ds { - struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ - __kernel_time_t shm_atime; /* last attach time */ -#ifdef __i386__ - unsigned long __unused1; -#endif - __kernel_time_t shm_dtime; /* last detach time */ -#ifdef __i386__ - unsigned long __unused2; -#endif - __kernel_time_t shm_ctime; /* last change time */ -#ifdef __i386__ - unsigned long __unused3; -#endif - __kernel_pid_t shm_cpid; /* pid of creator */ - __kernel_pid_t shm_lpid; /* pid of last operator */ - unsigned long shm_nattch; /* no. of current attaches */ - unsigned long __unused4; - unsigned long __unused5; -}; - -struct shminfo64 { - unsigned long shmmax; - unsigned long shmmin; - unsigned long shmmni; - unsigned long shmseg; - unsigned long shmall; - unsigned long __unused1; - unsigned long __unused2; - unsigned long __unused3; - unsigned long __unused4; -}; - -#endif /* _ASM_X86_SHMBUF_H */ +#include <asm-generic/shmbuf.h> diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index ca8bf2cd0ba..6b71384b9d8 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,60 +1 @@ -#ifndef _ASM_X86_SOCKET_H -#define _ASM_X86_SOCKET_H - -#include <asm/sockios.h> - -/* For setsockopt(2) */ -#define SOL_SOCKET 1 - -#define SO_DEBUG 1 -#define SO_REUSEADDR 2 -#define SO_TYPE 3 -#define SO_ERROR 4 -#define SO_DONTROUTE 5 -#define SO_BROADCAST 6 -#define SO_SNDBUF 7 -#define SO_RCVBUF 8 -#define SO_SNDBUFFORCE 32 -#define SO_RCVBUFFORCE 33 -#define SO_KEEPALIVE 9 -#define SO_OOBINLINE 10 -#define SO_NO_CHECK 11 -#define SO_PRIORITY 12 -#define SO_LINGER 13 -#define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ -#define SO_PASSCRED 16 -#define SO_PEERCRED 17 -#define SO_RCVLOWAT 18 -#define SO_SNDLOWAT 19 -#define SO_RCVTIMEO 20 -#define SO_SNDTIMEO 21 - -/* Security levels - as per NRL IPv6 - don't actually do anything */ -#define SO_SECURITY_AUTHENTICATION 22 -#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 -#define SO_SECURITY_ENCRYPTION_NETWORK 24 - -#define SO_BINDTODEVICE 25 - -/* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 - -#define SO_PEERNAME 28 -#define SO_TIMESTAMP 29 -#define SCM_TIMESTAMP SO_TIMESTAMP - -#define SO_ACCEPTCONN 30 - -#define SO_PEERSEC 31 -#define SO_PASSSEC 34 -#define SO_TIMESTAMPNS 35 -#define SCM_TIMESTAMPNS SO_TIMESTAMPNS - -#define SO_MARK 36 - -#define SO_TIMESTAMPING 37 -#define SCM_TIMESTAMPING SO_TIMESTAMPING - -#endif /* _ASM_X86_SOCKET_H */ +#include <asm-generic/socket.h> diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 49cc72b5d3c..def6d4746ee 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,13 +1 @@ -#ifndef _ASM_X86_SOCKIOS_H -#define _ASM_X86_SOCKIOS_H - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* _ASM_X86_SOCKIOS_H */ +#include <asm-generic/sockios.h> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2d742c6e15..15751776356 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -48,7 +48,7 @@ * head_32 for boot CPU and setup_per_cpu_areas() for others. */ #define GDT_STACK_CANARY_INIT \ - [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, + [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18), /* * Initialize the stackprotector canary value. @@ -78,21 +78,19 @@ static __always_inline void boot_init_stack_canary(void) #ifdef CONFIG_X86_64 percpu_write(irq_stack_union.stack_canary, canary); #else - percpu_write(stack_canary, canary); + percpu_write(stack_canary.canary, canary); #endif } static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 - unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20; + unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; - desc.base0 = canary & 0xffff; - desc.base1 = (canary >> 16) & 0xff; - desc.base2 = (canary >> 24) & 0xff; + set_desc_base(&desc, canary); write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); #endif } diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 643c59b4bc6..f08f9737489 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, "movl %P[task_canary](%[next]), %%ebx\n\t" \ "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" #define __switch_canary_oparam \ - , [stack_canary] "=m" (per_cpu_var(stack_canary)) + , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) #define __switch_canary_iparam \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ @@ -150,33 +150,6 @@ do { \ #endif #ifdef __KERNEL__ -#define _set_base(addr, base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while (0) - -#define _set_limit(addr, limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while (0) - -#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) -#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) extern void native_load_gs_index(unsigned); diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index af1b70ea440..3935b106de7 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,198 +1 @@ -#ifndef _ASM_X86_TERMBITS_H -#define _ASM_X86_TERMBITS_H - -#include <linux/posix_types.h> - -typedef unsigned char cc_t; -typedef unsigned int speed_t; -typedef unsigned int tcflag_t; - -#define NCCS 19 -struct termios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ -}; - -struct termios2 { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -struct ktermios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -/* c_cc characters */ -#define VINTR 0 -#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 - -/* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 - -/* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 - -/* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 -#define EXTA B19200 -#define EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 /* non standard rate */ -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ - -/* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 - -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - -/* tcsetattr uses these */ -#define TCSANOW 0 -#define TCSADRAIN 1 -#define TCSAFLUSH 2 - -#endif /* _ASM_X86_TERMBITS_H */ +#include <asm-generic/termbits.h> diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index c4ee8056bac..280d78a9d96 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,114 +1 @@ -#ifndef _ASM_X86_TERMIOS_H -#define _ASM_X86_TERMIOS_H - -#include <asm/termbits.h> -#include <asm/ioctls.h> - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - -#ifdef __KERNEL__ - -#include <asm/uaccess.h> - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -/* - * Translate a "termio" structure into a "termios". Ugh. - */ -#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ - unsigned short __tmp; \ - get_user(__tmp,&(termio)->x); \ - *(unsigned short *) &(termios)->x = __tmp; \ -} - -static inline int user_termio_to_kernel_termios(struct ktermios *termios, - struct termio __user *termio) -{ - SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); - get_user(termios->c_line, &termio->c_line); - return copy_from_user(termios->c_cc, termio->c_cc, NCC); -} - -/* - * Translate a "termios" structure into a "termio". Ugh. - */ -static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) -{ - put_user((termios)->c_iflag, &(termio)->c_iflag); - put_user((termios)->c_oflag, &(termio)->c_oflag); - put_user((termios)->c_cflag, &(termio)->c_cflag); - put_user((termios)->c_lflag, &(termio)->c_lflag); - put_user((termios)->c_line, &(termio)->c_line); - return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); -} - -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios2 __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios2)); -} - -static inline int kernel_termios_to_user_termios(struct termios2 __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios2)); -} - -static inline int user_termios_to_kernel_termios_1(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios_1(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios)); -} - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_TERMIOS_H */ +#include <asm-generic/termios.h> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40b75f..d27d0a2fec4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,7 +95,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -118,17 +118,17 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) +#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ - _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_FTRACE) + _TIF_SYSCALL_TRACEPOINT) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -137,7 +137,8 @@ struct thread_info { _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) +#define _TIF_ALLWORK_MASK \ + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ @@ -213,7 +214,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - ti = (void *)(percpu_read(kernel_stack) + + ti = (void *)(percpu_read_stable(kernel_stack) + KERNEL_STACK_OFFSET - THREAD_SIZE); return ti; } diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 066ef590d7e..26d06e052a1 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -129,25 +129,34 @@ extern unsigned long node_remap_size[]; #endif /* sched_domains SD_NODE_INIT for NUMA machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = SD_CACHE_NICE_TRIES, \ - .busy_idx = 3, \ - .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = SD_NEWIDLE_IDX, \ - .wake_idx = 1, \ - .forkexec_idx = SD_FORKEXEC_IDX, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_NODE_INIT (struct sched_domain) { \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_nice_tries = SD_CACHE_NICE_TRIES, \ + .busy_idx = 3, \ + .idle_idx = SD_IDLE_IDX, \ + .newidle_idx = SD_NEWIDLE_IDX, \ + .wake_idx = 1, \ + .forkexec_idx = SD_FORKEXEC_IDX, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SERIALIZE \ + | 1*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #ifdef CONFIG_X86_64_ACPI_NUMA diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bfd74c032fc..4da91ad69e0 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -81,9 +81,7 @@ extern int panic_on_unrecovered_nmi; void math_error(void __user *); void math_emulate(struct math_emu_info *); -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long, unsigned long); -#else +#ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 09b97745772..df1da20f453 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,19 +1,11 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#include <asm-generic/int-ll64.h> +#define dma_addr_t dma_addr_t -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; +#include <asm-generic/types.h> -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 87324cf439d..b7c29c8017f 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,12 +7,6 @@ * sigcontext struct (uc_mcontext). */ -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; +#include <asm-generic/ucontext.h> #endif /* _ASM_X86_UCONTEXT_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 732a3070615..8deaada61bc 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -345,6 +345,8 @@ #ifdef __KERNEL__ +#define NR_syscalls 337 + #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 900e1617e67..b9f3c60de5f 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #endif /* __NO_STUBS */ #ifdef __KERNEL__ + +#ifndef COMPILE_OFFSETS +#include <asm/asm-offsets.h> +#define NR_syscalls (__NR_syscall_max + 1) +#endif + /* * "Conditional" syscalls * diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 11be5ad2e0e..272514c2d45 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -55,6 +55,7 @@ #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -351,9 +352,16 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 + +#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) +#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPTP_UC_BIT (1ull << 8) +#define VMX_EPTP_WB_BIT (1ull << 14) +#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..832cb838cb4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285..67e929b8987 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -833,106 +833,6 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -static struct { - int gsi_base; - int gsi_end; -} mp_ioapic_routing[MAX_IO_APICS]; - -int mp_find_ioapic(int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_base) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; -} - -int mp_find_ioapic_pin(int ioapic, int gsi) -{ - if (WARN_ON(ioapic == -1)) - return -1; - if (WARN_ON(gsi > mp_ioapic_routing[ioapic].gsi_end)) - return -1; - - return gsi - mp_ioapic_routing[ioapic].gsi_base; -} - -static u8 __init uniq_ioapic_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif -} - -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) -{ - int idx = 0; - - if (bad_ioapic(address)) - return; - - idx = nr_ioapics; - - mp_ioapics[idx].type = MP_IOAPIC; - mp_ioapics[idx].flags = MPC_APIC_USABLE; - mp_ioapics[idx].apicaddr = address; - - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].apicid = uniq_ioapic_id(id); - mp_ioapics[idx].apicver = io_apic_get_version(idx); - - /* - * Build basic GSI lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI GSIs). - */ - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, - mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, - mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - nr_ioapics++; -} - int __init acpi_probe_gsi(void) { int idx; @@ -947,7 +847,7 @@ int __init acpi_probe_gsi(void) max_gsi = 0; for (idx = 0; idx < nr_ioapics; idx++) { - gsi = mp_ioapic_routing[idx].gsi_end; + gsi = mp_gsi_routing[idx].gsi_end; if (gsi > max_gsi) max_gsi = gsi; @@ -1179,9 +1079,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ - if (acpi_disabled || acpi_noirq) { + if (acpi_disabled || acpi_noirq) return -ENODEV; - } if (!cpu_has_apic) return -ENODEV; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f5765870257..de7353c0ce9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/stringify.h> #include <linux/kprobes.h> #include <linux/mm.h> #include <linux/vmalloc.h> @@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly); #define smp_alt_once 1 #endif -static int debug_alternative; +static int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { @@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str) __setup("noreplace-smp", setup_noreplace_smp); #ifdef CONFIG_PARAVIRT -static int noreplace_paravirt = 0; +static int __initdata_or_module noreplace_paravirt = 0; static int __init setup_noreplace_paravirt(char *str) { @@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) -#ifdef GENERIC_NOP1 +#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. */ -asm("\t.section .rodata, \"a\"\nintelnops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8 "\t.previous"); extern const unsigned char intelnops[]; -static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, intelnops + 1, @@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #endif #ifdef K8_NOP1 -asm("\t.section .rodata, \"a\"\nk8nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 K8_NOP7 K8_NOP8 "\t.previous"); extern const unsigned char k8nops[]; -static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, k8nops + 1, @@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { }; #endif -#ifdef K7_NOP1 -asm("\t.section .rodata, \"a\"\nk7nops: " +#if defined(K7_NOP1) && !defined(CONFIG_X86_64) +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 K7_NOP7 K7_NOP8 "\t.previous"); extern const unsigned char k7nops[]; -static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, k7nops + 1, @@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #endif #ifdef P6_NOP1 -asm("\t.section .rodata, \"a\"\np6nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 P6_NOP7 P6_NOP8 "\t.previous"); extern const unsigned char p6nops[]; -static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +p6_nops[ASM_NOP_MAX+1] = { NULL, p6nops, p6nops + 1, @@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_has(X86_FEATURE_NOPL)) @@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void) #else /* CONFIG_X86_64 */ -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_has(X86_FEATURE_K8)) return k8_nops; @@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void) #endif /* CONFIG_X86_64 */ /* Use this to add nops to a buffer, then text_poke the whole buffer. */ -void add_nops(void *insns, unsigned int len) +static void __init_or_module add_nops(void *insns, unsigned int len) { const unsigned char *const *noptable = find_nop_table(); @@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len) len -= noplen; } } -EXPORT_SYMBOL_GPL(add_nops); extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; +static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[]; APs have less capabilities than the boot processor are not handled. Tough. Make sure you disable such features by hand. */ -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; char insnbuf[MAX_PATCH_LEN]; @@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules); static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ -void alternatives_smp_module_add(struct module *mod, char *name, - void *locks, void *locks_end, - void *text, void *text_end) +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) { struct smp_alt_module *smp; @@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name, mutex_unlock(&smp_alt); } -void alternatives_smp_module_del(struct module *mod) +void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; @@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; @@ -485,13 +492,14 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *text_poke_early(void *addr, const void *opcode, size_t len) +static void *__init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6c99f503780..98f230f6a28 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -41,9 +41,13 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); -#ifdef CONFIG_IOMMU_API +/* + * Domain for untranslated devices - only allocated + * if iommu=pt passed on kernel cmd line. + */ +static struct protection_domain *pt_domain; + static struct iommu_ops amd_iommu_ops; -#endif /* * general struct to manage commands send to an IOMMU @@ -55,16 +59,16 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 - **pte_page, gfp_t gfp); +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, int end_lvl, + u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages); - -#ifndef BUS_NOTIFY_UNBOUND_DRIVER -#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005 -#endif +static void reset_iommu_command_buffer(struct amd_iommu *iommu); +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size); +static void update_domain(struct protection_domain *domain); #ifdef CONFIG_AMD_IOMMU_STATS @@ -138,7 +142,25 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * ****************************************************************************/ -static void iommu_print_event(void *__evt) +static void dump_dte_entry(u16 devid) +{ + int i; + + for (i = 0; i < 8; ++i) + pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + amd_iommu_dev_table[devid].data[i]); +} + +static void dump_command(unsigned long phys_addr) +{ + struct iommu_cmd *cmd = phys_to_virt(phys_addr); + int i; + + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]); +} + +static void iommu_print_event(struct amd_iommu *iommu, void *__evt) { u32 *event = __evt; int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; @@ -147,7 +169,7 @@ static void iommu_print_event(void *__evt) int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; u64 address = (u64)(((u64)event[3]) << 32) | event[2]; - printk(KERN_ERR "AMD IOMMU: Event logged ["); + printk(KERN_ERR "AMD-Vi: Event logged ["); switch (type) { case EVENT_TYPE_ILL_DEV: @@ -155,6 +177,7 @@ static void iommu_print_event(void *__evt) "address=0x%016llx flags=0x%04x]\n", PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), address, flags); + dump_dte_entry(devid); break; case EVENT_TYPE_IO_FAULT: printk("IO_PAGE_FAULT device=%02x:%02x.%x " @@ -176,6 +199,8 @@ static void iommu_print_event(void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + reset_iommu_command_buffer(iommu); + dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: printk("COMMAND_HARDWARE_ERROR address=0x%016llx " @@ -209,7 +234,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); while (head != tail) { - iommu_print_event(iommu->evt_buf + head); + iommu_print_event(iommu, iommu->evt_buf + head); head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; } @@ -296,8 +321,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + if (unlikely(i == EXIT_LOOP_COUNT)) { + spin_unlock(&iommu->lock); + reset_iommu_command_buffer(iommu); + spin_lock(&iommu->lock); + } } /* @@ -445,47 +473,78 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) } /* + * This function flushes one domain on one IOMMU + */ +static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +{ + struct iommu_cmd cmd; + unsigned long flags; + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); +} + +static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +{ + int i; + + for (i = 1; i < MAX_DOMAIN_ID; ++i) { + if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + continue; + flush_domain_on_iommu(iommu, i); + } + +} + +/* * This function is used to flush the IO/TLB for a given protection domain * on every IOMMU in the system */ static void iommu_flush_domain(u16 domid) { - unsigned long flags; struct amd_iommu *iommu; - struct iommu_cmd cmd; INC_STATS_COUNTER(domain_flush_all); - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); - - for_each_iommu(iommu) { - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); - } + for_each_iommu(iommu) + flush_domain_on_iommu(iommu, domid); } void amd_iommu_flush_all_domains(void) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + flush_all_domains_on_iommu(iommu); +} + +static void flush_all_devices_for_iommu(struct amd_iommu *iommu) +{ int i; - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (iommu != amd_iommu_rlookup_table[i]) continue; - iommu_flush_domain(i); + + iommu_queue_inv_dev_entry(iommu, i); + iommu_completion_wait(iommu); } } -void amd_iommu_flush_all_devices(void) +static void flush_devices_by_domain(struct protection_domain *domain) { struct amd_iommu *iommu; int i; for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] == NULL) + if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || + (amd_iommu_pd_table[i] != domain)) continue; iommu = amd_iommu_rlookup_table[i]; @@ -497,6 +556,27 @@ void amd_iommu_flush_all_devices(void) } } +static void reset_iommu_command_buffer(struct amd_iommu *iommu) +{ + pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); + + if (iommu->reset_in_progress) + panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + + iommu->reset_in_progress = true; + + amd_iommu_reset_cmd_buffer(iommu); + flush_all_devices_for_iommu(iommu); + flush_all_domains_on_iommu(iommu); + + iommu->reset_in_progress = false; +} + +void amd_iommu_flush_all_devices(void) +{ + flush_devices_by_domain(NULL); +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -514,18 +594,21 @@ void amd_iommu_flush_all_devices(void) static int iommu_map_page(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, - int prot) + int prot, + int map_size) { u64 __pte, *pte; bus_addr = PAGE_ALIGN(bus_addr); phys_addr = PAGE_ALIGN(phys_addr); - /* only support 512GB address spaces for now */ - if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) + BUG_ON(!PM_ALIGNED(map_size, bus_addr)); + BUG_ON(!PM_ALIGNED(map_size, phys_addr)); + + if (!(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL); + pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); if (IOMMU_PTE_PRESENT(*pte)) return -EBUSY; @@ -538,29 +621,18 @@ static int iommu_map_page(struct protection_domain *dom, *pte = __pte; + update_domain(dom); + return 0; } static void iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr) + unsigned long bus_addr, int map_size) { - u64 *pte; - - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + u64 *pte = fetch_pte(dom, bus_addr, map_size); - if (!IOMMU_PTE_PRESENT(*pte)) - return; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; - - *pte = 0; + if (pte) + *pte = 0; } /* @@ -615,7 +687,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, + PM_MAP_4k); if (ret) return ret; /* @@ -670,24 +743,29 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * This function checks if there is a PTE for a given dma address. If * there is one, it returns the pointer to it. */ -static u64* fetch_pte(struct protection_domain *domain, - unsigned long address) +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) { + int level; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)]; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + level -= 1; - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } return pte; } @@ -727,7 +805,7 @@ static int alloc_new_range(struct amd_iommu *iommu, u64 *pte, *pte_page; for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, + pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, &pte_page, gfp); if (!pte) goto out_free; @@ -760,16 +838,20 @@ static int alloc_new_range(struct amd_iommu *iommu, for (i = dma_dom->aperture[index]->offset; i < dma_dom->aperture_size; i += PAGE_SIZE) { - u64 *pte = fetch_pte(&dma_dom->domain, i); + u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); if (!pte || !IOMMU_PTE_PRESENT(*pte)) continue; dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); } + update_domain(&dma_dom->domain); + return 0; out_free: + update_domain(&dma_dom->domain); + free_page((unsigned long)dma_dom->aperture[index]->bitmap); kfree(dma_dom->aperture[index]); @@ -1009,7 +1091,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; - dma_dom->domain.mode = PAGE_MODE_3_LEVEL; + dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; @@ -1063,6 +1145,41 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +static void set_dte_entry(u16 devid, struct protection_domain *domain) +{ + u64 pte_root = virt_to_phys(domain->pt_root); + + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + + amd_iommu_pd_table[devid] = domain; +} + +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ +static void __attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) +{ + /* lock domain */ + spin_lock(&domain->lock); + + /* update DTE entry */ + set_dte_entry(devid, domain); + + domain->dev_cnt += 1; + + /* ready */ + spin_unlock(&domain->lock); +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1072,27 +1189,16 @@ static void attach_device(struct amd_iommu *iommu, u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); - - domain->dev_cnt += 1; - - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) - << DEV_ENTRY_MODE_SHIFT; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[2] = domain->id; - - amd_iommu_pd_table[devid] = domain; + __attach_device(iommu, domain, devid); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - /* - * We might boot into a crash-kernel here. The crashed kernel - * left the caches in the IOMMU dirty. So we have to flush - * here to evict all dirty stuff. - */ + /* + * We might boot into a crash-kernel here. The crashed kernel + * left the caches in the IOMMU dirty. So we have to flush + * here to evict all dirty stuff. + */ iommu_queue_inv_dev_entry(iommu, devid); iommu_flush_tlb_pde(iommu, domain->id); } @@ -1119,6 +1225,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid) /* ready */ spin_unlock(&domain->lock); + + /* + * If we run in passthrough mode the device must be assigned to the + * passthrough domain if it is detached from any other domain + */ + if (iommu_pass_through) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + __attach_device(iommu, pt_domain, devid); + } } /* @@ -1164,6 +1279,8 @@ static int device_change_notifier(struct notifier_block *nb, case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) goto out; + if (iommu_pass_through) + break; detach_device(domain, devid); break; case BUS_NOTIFY_ADD_DEVICE: @@ -1292,39 +1409,91 @@ static int get_device_resources(struct device *dev, return 1; } +static void update_device_table(struct protection_domain *domain) +{ + unsigned long flags; + int i; + + for (i = 0; i <= amd_iommu_last_bdf; ++i) { + if (amd_iommu_pd_table[i] != domain) + continue; + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + set_dte_entry(i, domain); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + } +} + +static void update_domain(struct protection_domain *domain) +{ + if (!domain->updated) + return; + + update_device_table(domain); + flush_devices_by_domain(domain); + iommu_flush_domain(domain->id); + + domain->updated = false; +} + /* - * If the pte_page is not yet allocated this function is called + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. */ -static u64* alloc_pte(struct protection_domain *dom, - unsigned long address, u64 **pte_page, gfp_t gfp) +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) +{ + u64 *pte; + + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) { u64 *pte, *page; + int level; - pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)]; + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L2_PDE(virt_to_phys(page)); - } + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(address)]; + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = IOMMU_L1_PDE(virt_to_phys(page)); - } + level -= 1; - pte = IOMMU_PTE_PAGE(*pte); + pte = IOMMU_PTE_PAGE(*pte); - if (pte_page) - *pte_page = pte; + if (pte_page && level == end_lvl) + *pte_page = pte; - pte = &pte[IOMMU_PTE_L0_INDEX(address)]; + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } return pte; } @@ -1344,10 +1513,13 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { - pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC); + pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, + GFP_ATOMIC); aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; } else - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); + + update_domain(&dom->domain); return pte; } @@ -1409,7 +1581,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, if (!pte) return; - pte += IOMMU_PTE_L0_INDEX(address); + pte += PM_LEVEL_INDEX(0, address); WARN_ON(!*pte); @@ -1988,19 +2160,47 @@ static void cleanup_domain(struct protection_domain *domain) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -static int amd_iommu_domain_init(struct iommu_domain *dom) +static void protection_domain_free(struct protection_domain *domain) +{ + if (!domain) + return; + + if (domain->id) + domain_id_free(domain->id); + + kfree(domain); +} + +static struct protection_domain *protection_domain_alloc(void) { struct protection_domain *domain; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return -ENOMEM; + return NULL; spin_lock_init(&domain->lock); - domain->mode = PAGE_MODE_3_LEVEL; domain->id = domain_id_alloc(); if (!domain->id) + goto out_err; + + return domain; + +out_err: + kfree(domain); + + return NULL; +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = protection_domain_alloc(); + if (!domain) goto out_free; + + domain->mode = PAGE_MODE_3_LEVEL; domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); if (!domain->pt_root) goto out_free; @@ -2010,7 +2210,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) return 0; out_free: - kfree(domain); + protection_domain_free(domain); return -ENOMEM; } @@ -2115,7 +2315,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr &= PAGE_MASK; for (i = 0; i < npages; ++i) { - ret = iommu_map_page(domain, iova, paddr, prot); + ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) return ret; @@ -2136,7 +2336,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; for (i = 0; i < npages; ++i) { - iommu_unmap_page(domain, iova); + iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } @@ -2151,21 +2351,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, phys_addr_t paddr; u64 *pte; - pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; - - if (!IOMMU_PTE_PRESENT(*pte)) - return 0; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + pte = fetch_pte(domain, iova, PM_MAP_4k); - if (!IOMMU_PTE_PRESENT(*pte)) + if (!pte || !IOMMU_PTE_PRESENT(*pte)) return 0; paddr = *pte & IOMMU_PAGE_MASK; @@ -2191,3 +2379,46 @@ static struct iommu_ops amd_iommu_ops = { .domain_has_cap = amd_iommu_domain_has_cap, }; +/***************************************************************************** + * + * The next functions do a basic initialization of IOMMU for pass through + * mode + * + * In passthrough mode the IOMMU is initialized and enabled but not used for + * DMA-API translation. + * + *****************************************************************************/ + +int __init amd_iommu_init_passthrough(void) +{ + struct pci_dev *dev = NULL; + u16 devid, devid2; + + /* allocate passthroug domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode |= PAGE_MODE_NONE; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + struct amd_iommu *iommu; + + devid = calc_devid(dev->bus->number, dev->devfn); + if (devid > amd_iommu_last_bdf) + continue; + + devid2 = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid2]; + if (!iommu) + continue; + + __attach_device(iommu, pt_domain, devid); + __attach_device(iommu, pt_domain, devid2); + } + + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); + + return 0; +} diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c1b17e97252..b4b61d462dc 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); @@ -435,6 +435,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) } /* + * This function resets the command buffer if the IOMMU stopped fetching + * commands from it. + */ +void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu) +{ + iommu_feature_disable(iommu, CONTROL_CMDBUF_EN); + + writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); +} + +/* * This function writes the command buffer address to the hardware and * enables it. */ @@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &entry, sizeof(entry)); - /* set head and tail to zero manually */ - writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); + amd_iommu_reset_cmd_buffer(iommu); } static void __init free_command_buffer(struct amd_iommu *iommu) @@ -858,7 +868,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) switch (*p) { case ACPI_IVHD_TYPE: - DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x " + DUMP_printk("device: %02x:%02x.%01x cap: %04x " "seg: %d flags: %01x info %04x\n", PCI_BUS(h->devid), PCI_SLOT(h->devid), PCI_FUNC(h->devid), h->cap_ptr, @@ -902,7 +912,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu) r = request_irq(iommu->dev->irq, amd_iommu_int_handler, IRQF_SAMPLE_RANDOM, - "AMD IOMMU", + "AMD-Vi", NULL); if (r) { @@ -1150,7 +1160,7 @@ int __init amd_iommu_init(void) if (no_iommu) { - printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); + printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); return 0; } @@ -1242,22 +1252,28 @@ int __init amd_iommu_init(void) if (ret) goto free; - ret = amd_iommu_init_dma_ops(); + if (iommu_pass_through) + ret = amd_iommu_init_passthrough(); + else + ret = amd_iommu_init_dma_ops(); if (ret) goto free; enable_iommus(); - printk(KERN_INFO "AMD IOMMU: device isolation "); + if (iommu_pass_through) + goto out; + + printk(KERN_INFO "AMD-Vi: device isolation "); if (amd_iommu_isolate) printk("enabled\n"); else printk("disabled\n"); if (amd_iommu_unmap_flush) - printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else - printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); out: return ret; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debfc170..128111d8ffe 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,6 +20,7 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> +#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) * code for safe */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(p); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..159740decc4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -49,6 +49,7 @@ #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/kvm_para.h> unsigned int num_processors; @@ -1361,52 +1362,80 @@ void enable_x2apic(void) } #endif /* CONFIG_X86_X2APIC */ -void __init enable_IR_x2apic(void) +int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP - int ret; - unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; - - ret = dmar_table_init(); - if (ret) { - pr_debug("dmar_table_init() failed with %d:\n", ret); - goto ir_failed; - } - if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - goto ir_failed; + return 0; } - if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return; + return 0; } + if (enable_intr_remapping(x2apic_supported())) + return 0; + + pr_info("Enabled Interrupt-remapping\n"); + + return 1; + +#endif + return 0; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; + int ret, x2apic_enabled = 0; + int dmar_table_init_ret = 0; + +#ifdef CONFIG_INTR_REMAP + dmar_table_init_ret = dmar_table_init(); + if (dmar_table_init_ret) + pr_debug("dmar_table_init() failed with %d:\n", + dmar_table_init_ret); +#endif + ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { - pr_info("Allocate ioapic_entries failed: %d\n", ret); - goto end; + pr_err("Allocate ioapic_entries failed\n"); + goto out; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto end; + goto out; } local_irq_save(flags); - mask_IO_APIC_setup(ioapic_entries); mask_8259A(); + mask_IO_APIC_setup(ioapic_entries); - ret = enable_intr_remapping(x2apic_supported()); - if (ret) - goto end_restore; + if (dmar_table_init_ret) + ret = 0; + else + ret = enable_IR(); - pr_info("Enabled Interrupt-remapping\n"); + if (!ret) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || !kvm_para_available()) + goto nox2apic; + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } + + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { x2apic_mode = 1; @@ -1414,41 +1443,25 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -end_restore: - if (ret) - /* - * IR enabling failed - */ +nox2apic: + if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); local_irq_restore(flags); -end: +out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (!ret) + if (x2apic_enabled) return; -ir_failed: if (x2apic_preenabled) - panic("x2apic enabled by bios. But IR enabling failed"); + panic("x2apic: enabled by BIOS but kernel init failed."); else if (cpu_has_x2apic) - pr_info("Not enabling x2apic,Intr-remapping\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); -#endif - - return; + pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); } - #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. @@ -1549,8 +1562,6 @@ no_apic: #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) { - unsigned long phys_addr; - /* * If no local APIC can be found then go out * : it means there is no mpatable and MADT @@ -1558,11 +1569,9 @@ void __init early_init_lapic_mapping(void) if (!smp_found_config) return; - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); + APIC_BASE, mp_lapic_addr); /* * Fetch the APIC ID of the BSP in case we have a @@ -1651,7 +1660,6 @@ int __init APIC_init_uniprocessor(void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } #endif diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8952a589028..89174f847b4 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void) { /* MPENTIUMIII */ if (boot_cpu_data.x86 == 6 && - (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) + (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) return 1; return 0; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc8..3c8f9e75d03 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,8 @@ #include <asm/apic.h> #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? @@ -85,6 +87,9 @@ int nr_ioapic_registers[MAX_IO_APICS]; struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; +/* IO APIC gsi routing info */ +struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; + /* MP IRQ source entries */ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; @@ -116,15 +121,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_pin_list; - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -139,6 +135,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; @@ -414,13 +415,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -428,9 +426,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -498,72 +493,68 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; + struct irq_pin_list **last, *entry; - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } - - while (entry->next) { - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; + return 0; + last = &entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } entry->apic = apic; entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* * Reroute an IRQ to a different pin. */ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; + struct irq_pin_list *entry; - while (entry) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } - entry = entry->next; } - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -580,7 +571,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -596,11 +586,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { @@ -613,7 +598,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -1702,12 +1686,8 @@ __apicdebuginit(void) print_IO_APIC(void) if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } printk("\n"); } @@ -2211,7 +2191,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2224,14 +2203,6 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -2269,13 +2240,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2288,9 +2255,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } @@ -2515,11 +2479,8 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2532,31 +2493,28 @@ static void ack_apic_level(unsigned int irq) } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2598,7 +2556,7 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2606,26 +2564,15 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void @@ -3241,8 +3188,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); @@ -3912,7 +3858,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); @@ -3941,11 +3891,28 @@ int io_apic_set_pci_routing(struct device *dev, int irq, return __io_apic_set_pci_routing(dev, irq, irq_attr); } -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ +u8 __init io_apic_unique_id(u8 id) +{ +#ifdef CONFIG_X86_32 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +#else + int i; + DECLARE_BITMAP(used, 256); -#ifdef CONFIG_ACPI + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +#endif +} #ifdef CONFIG_X86_32 int __init io_apic_get_unique_id(int ioapic, int apic_id) @@ -4054,8 +4021,6 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) return 0; } -#endif /* CONFIG_ACPI */ - /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online @@ -4109,7 +4074,7 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(void) +static struct resource * __init ioapic_setup_resources(int nr_ioapics) { unsigned long n; struct resource *res; @@ -4125,15 +4090,13 @@ static struct resource * __init ioapic_setup_resources(void) mem = alloc_bootmem(n); res = (void *)mem; - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * nr_ioapics; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; } ioapic_resources = res; @@ -4147,7 +4110,7 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(); + ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].apicaddr; @@ -4176,11 +4139,9 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; } } @@ -4201,3 +4162,76 @@ void __init ioapic_insert_resources(void) r++; } } + +int mp_find_ioapic(int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_gsi_routing[i].gsi_base) + && (gsi <= mp_gsi_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, int gsi) +{ + if (WARN_ON(ioapic == -1)) + return -1; + if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + return -1; + + return gsi - mp_gsi_routing[ioapic].gsi_base; +} + +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + mp_ioapics[idx].type = MP_IOAPIC; + mp_ioapics[idx].flags = MPC_APIC_USABLE; + mp_ioapics[idx].apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].apicid = io_apic_unique_id(id); + mp_ioapics[idx].apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_gsi_routing[idx].gsi_base = gsi_base; + mp_gsi_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, + mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, + mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + + nr_ioapics++; +} diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..db7220220d0 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -552,14 +554,18 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index fcec2f1d34a..65edc180fc8 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -55,11 +55,11 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic_mode && (apic != &apic_x2apic_phys && + if (x2apic_mode #ifdef CONFIG_X86_UV - apic != &apic_x2apic_uv_x && + && apic != &apic_x2apic_uv_x #endif - apic != &apic_x2apic_cluster)) { + ) { if (x2apic_phys) apic = &apic_x2apic_phys; else diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 442b5508893..151ace69a5a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -2332,15 +2340,6 @@ static int __init apm_init(void) pm_flags |= PM_APM; /* - * Set up a segment that references the real mode segment 0x40 - * that extends up to the end of page zero (that we have reserved). - * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); - - /* * Set up the long jump entry point to the APM BIOS, which is called * from inline assembly. */ @@ -2358,12 +2357,12 @@ static int __init apm_init(void) * code to that CPU. */ gdt = get_cpu_gdt_table(0); - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); proc_create("apm", 0, NULL, &apm_file_ops); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc47e12..4a6aeedcd96 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -3,6 +3,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data. */ +#define COMPILE_OFFSETS #include <linux/crypto.h> #include <linux/sched.h> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 83b217c7225..f32fa71ccf9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -253,6 +253,64 @@ static int __cpuinit nearby_node(int apicid) #endif /* + * Fixup core topology information for AMD multi-node processors. + * Assumption 1: Number of cores in each internal node is the same. + * Assumption 2: Mixed systems with both single-node and dual-node + * processors are not supported. + */ +#ifdef CONFIG_X86_HT +static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_PCI + u32 t, cpn; + u8 n, n_id; + int cpu = smp_processor_id(); + + /* fixup topology information only once for a core */ + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return; + + /* check for multi-node processor on boot cpu */ + t = read_pci_config(0, 24, 3, 0xe8); + if (!(t & (1 << 29))) + return; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + + /* cores per node: each internal node has half the number of cores */ + cpn = c->x86_max_cores >> 1; + + /* even-numbered NB_id of this dual-node processor */ + n = c->phys_proc_id << 1; + + /* + * determine internal node id and assign cores fifty-fifty to + * each node of the dual-node processor + */ + t = read_pci_config(0, 24 + n, 3, 0xe8); + n = (t>>30) & 0x3; + if (n == 0) { + if (c->cpu_core_id < cpn) + n_id = 0; + else + n_id = 1; + } else { + if (c->cpu_core_id < cpn) + n_id = 1; + else + n_id = 0; + } + + /* compute entire NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; + + /* fixup core id to be in range from 0 to cpn */ + c->cpu_core_id = c->cpu_core_id % cpn; +#endif +} +#endif + +/* * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ @@ -269,17 +327,31 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + /* fixup topology information on multi-node processors */ + if ((c->x86 == 0x10) && (c->x86_model == 9)) + amd_fixup_dcm(c); #endif } +int amd_get_nb_id(int cpu) +{ + int id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); +#endif + return id; +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; - unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; + unsigned apicid = c->apicid; + + node = per_cpu(cpu_llc_id, cpu); - node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; if (!node_online(node)) { @@ -406,12 +478,24 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* * Some BIOSes incorrectly force this feature, but only K8 * revision D (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14) + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + u64 val; + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &val)) { + val &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, val); + } + } + } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); #else /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 734eaad9365..2055fc2b2e6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -94,45 +94,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; @@ -987,13 +987,21 @@ struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; /* @@ -1008,8 +1016,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ void syscall_init(void) @@ -1042,8 +1049,11 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, stack_canary); +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif /* Make sure %fs and %gs are initialized properly in idle threads */ diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include <asm/apic.h> #include <asm/desc.h> -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 306bf0dca06..804c40e2bc3 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -241,7 +241,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 0: if (!l1->val) return; - assoc = l1->assoc; + assoc = assocs[l1->assoc]; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; @@ -249,7 +249,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 2: if (!l2.val) return; - assoc = l2.assoc; + assoc = assocs[l2.assoc]; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ @@ -258,10 +258,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, case 3: if (!l3.val) return; - assoc = l3.assoc; + assoc = assocs[l3.assoc]; line_size = l3.line_size; lines_per_tag = l3.lines_per_tag; size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } break; default: return; @@ -270,18 +274,14 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.is_self_initializing = 1; eax->split.type = types[leaf]; eax->split.level = levels[leaf]; - if (leaf == 3) - eax->split.num_threads_sharing = - current_cpu_data.x86_max_cores - 1; - else - eax->split.num_threads_sharing = 0; + eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - if (assoc == 0xf) + if (assoc == 0xffff) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assocs[assoc] - 1; + ebx->split.ways_of_associativity = assoc - 1; ebx->split.physical_line_partition = lines_per_tag - 1; ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / (ebx->split.ways_of_associativity + 1) - 1; @@ -523,6 +523,18 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); + if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { + struct cpuinfo_x86 *d; + for_each_online_cpu(i) { + if (!per_cpu(cpuid4_info, i)) + continue; + d = &cpu_data(i); + this_leaf = CPUID4_INFO_IDX(i, index); + cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), + d->llc_shared_map); + } + return; + } this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 01213048f62..fdd51b55435 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -183,6 +183,11 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void __weak decode_mce(struct mce *m) +{ + return; +} + static void print_mce(struct mce *m) { printk(KERN_EMERG @@ -205,6 +210,8 @@ static void print_mce(struct mce *m) printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + + decode_mce(m); } static void print_mce_head(void) @@ -215,7 +222,10 @@ static void print_mce_head(void) static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); +#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) + "Run through mcelog --ascii to decode and contact your hardware vendor\n" +#endif + ); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -1091,7 +1101,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1120,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1325,7 +1335,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1925,7 +1935,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bd..8cd5224943b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { @@ -489,12 +489,14 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_core_mask(cpu)); + i = cpumask_first(c->llc_shared_map); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -514,7 +516,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, cpu_core_mask(cpu)); + cpumask_copy(b->cpus, c->llc_shared_map); per_cpu(threshold_banks, cpu)[bank] = b; goto out; @@ -539,7 +541,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, cpu_core_mask(cpu)); + cpumask_copy(b->cpus, c->llc_shared_map); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 900332b800f..2732e2c1e4d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -6,6 +6,7 @@ * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * * For licencing details see kernel-base/COPYING */ @@ -20,6 +21,7 @@ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/highmem.h> +#include <linux/cpu.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -27,12 +29,52 @@ static u64 perf_counter_mask __read_mostly; +/* The maximal number of PEBS counters: */ +#define MAX_PEBS_COUNTERS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; +}; + struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -58,6 +100,8 @@ struct x86_pmu { int apic; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; @@ -577,6 +621,9 @@ x86_perf_counter_update(struct perf_counter *counter, u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * Careful: an NMI might modify the previous counter value. * @@ -666,10 +713,110 @@ static void release_pmc_hardware(void) #endif } +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_counters, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_counters, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_counters, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + static void hw_perf_counter_destroy(struct perf_counter *counter) { if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -712,6 +859,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + /* * Setup the hardware configuration for a given attr_type */ @@ -728,9 +911,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter) err = 0; if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else + if (atomic_read(&active_counters) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } @@ -793,6 +980,20 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (config == -1LL) return -EINVAL; + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + hwc->config |= config; return 0; @@ -817,7 +1018,18 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) @@ -875,7 +1087,25 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_counter *counter = + cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!counter)) + return; + + intel_pmu_enable_bts(counter->hw.config); + } } static void amd_pmu_enable_all(void) @@ -962,6 +1192,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) static inline void intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; @@ -976,7 +1211,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -990,6 +1225,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, s64 period = hwc->sample_period; int err, ret = 0; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -1015,7 +1253,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1072,6 +1310,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_counters).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; @@ -1093,11 +1339,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + if (!x86_pmu.num_counters_fixed) return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) @@ -1118,7 +1369,15 @@ static int x86_pmu_enable(struct perf_counter *counter) int idx; idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->counter_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* * Try to get the fixed counter, if that is already taken * then try to get a generic counter: @@ -1211,7 +1470,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1229,6 +1488,44 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, + struct perf_sample_data *data) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + unsigned long orig_ip = data->regs->ip; + struct bts_record *at, *top; + + if (!counter) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + ds->bts_index = ds->bts_buffer_base; + + for (; at < top; at++) { + data->regs->ip = at->from; + data->addr = at->to; + + perf_counter_output(counter, 1, data); + } + + data->regs->ip = orig_ip; + data->addr = 0; + + /* There's new data available. */ + counter->pending_kill = POLL_IN; +} + static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1253,6 +1550,15 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct perf_sample_data data; + struct pt_regs regs; + + data.regs = ®s; + intel_pmu_drain_bts_buffer(cpuc, &data); + } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1280,6 +1586,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter) static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; unsigned long flags; int idx; @@ -1297,6 +1604,8 @@ static void intel_pmu_reset(void) for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } @@ -1362,6 +1671,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc, &data); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1571,6 +1881,8 @@ static struct x86_pmu intel_pmu = { * the generic counter period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1798,8 +2110,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static DEFINE_PER_CPU(int, in_nmi_frame); @@ -1952,9 +2264,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; @@ -1962,3 +2274,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void hw_perf_counter_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 1e904346bbf..62ac8cb6ba2 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6c09d..37250fe490b 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -27,9 +27,7 @@ static void doublefault_fn(void) if (ptr_ok(gdt)) { gdt += GDT_ENTRY_TSS << 3; - tss = *(u16 *)(gdt+2); - tss += *(u8 *)(gdt+4) << 16; - tss += *(u8 *)(gdt+7) << 24; + tss = get_desc_base((struct desc_struct *)gdt); printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5cb5725b2ba..147005a1cc3 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, { int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { + if (x >= ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be74510..d59fe323807 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller) END(ftrace_graph_caller) GLOBAL(return_to_handler) - subq $80, %rsp + subq $24, %rsp /* Save the return values */ movq %rax, (%rsp) @@ -155,10 +155,10 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 72(%rsp) + movq %rax, 16(%rsp) movq 8(%rsp), %rdx movq (%rsp), %rax - addq $72, %rsp + addq $16, %rsp retq #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea3b9f..9dbb527e165 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; @@ -498,37 +494,56 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) struct syscall_metadata *syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; } -void arch_init_ftrace_syscalls(void) +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < NR_syscalls; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + +static int __init arch_init_ftrace_syscalls(void) { int i; struct syscall_metadata *meta; unsigned long **psys_syscall_table = &sys_call_table; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto end; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - FTRACE_SYSCALL_MAX, GFP_KERNEL); + NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return; + return -ENOMEM; } - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } - return; - - /* Paranoid: avoid overflow */ -end: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index cc827ac9e8d..7ffec6b3b33 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -439,7 +439,6 @@ is386: movl $2,%ecx # set MP jne 1f movl $per_cpu__gdt_page,%eax movl $per_cpu__stack_canary,%ecx - subl $20, %ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c664d515f61..63b0ec8d3d4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,7 +34,6 @@ struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; - enum paravirt_lazy_mode mode; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); @@ -77,7 +76,7 @@ static void kvm_deferred_mmu_op(void *buffer, int len) { struct kvm_para_state *state = kvm_para_state(); - if (state->mode != PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { kvm_mmu_op(buffer, len); return; } @@ -185,10 +184,7 @@ static void kvm_release_pt(unsigned long pfn) static void kvm_enter_lazy_mmu(void) { - struct kvm_para_state *state = kvm_para_state(); - paravirt_enter_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void kvm_leave_lazy_mmu(void) @@ -197,7 +193,6 @@ static void kvm_leave_lazy_mmu(void) mmu_queue_flush(state); paravirt_leave_lazy_mmu(); - state->mode = paravirt_get_lazy_mode(); } static void __init paravirt_ops_setup(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..e5efcdcca31 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -50,8 +50,8 @@ static unsigned long kvm_get_wallclock(void) struct timespec ts; int low, high; - low = (int)__pa(&wall_clock); - high = ((u64)__pa(&wall_clock) >> 32); + low = (int)__pa_symbol(&wall_clock); + high = ((u64)__pa_symbol(&wall_clock) >> 32); native_write_msr(MSR_KVM_WALL_CLOCK, low, high); vcpu_time = &get_cpu_var(hv_clock); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..fcd513bf284 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -482,11 +482,11 @@ static void __init construct_ioapic_table(int mpc_default_type) MP_bus_info(&bus); } - ioapic.type = MP_IOAPIC; - ioapic.apicid = 2; - ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.flags = MPC_APIC_USABLE; - ioapic.apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; MP_ioapic_info(&ioapic); /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 98fd6cd4e3a..7dd95009417 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,7 @@ /* ----------------------------------------------------------------------- * * * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -80,11 +81,8 @@ static ssize_t msr_read(struct file *file, char __user *buf, for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } if (copy_to_user(tmp, &data, 8)) { err = -EFAULT; break; @@ -115,11 +113,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf, break; } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } tmp += 2; bytes += 8; } @@ -127,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf, return bytes ? bytes : err; } +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file->f_path.dentry->d_inode); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); @@ -157,6 +200,8 @@ static const struct file_operations msr_fops = { .read = msr_read, .write = msr_write, .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, }; static int __cpuinit msr_device_create(int cpu) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..f5b0b4a01fb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -362,8 +362,9 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .read_msr_amd = native_read_msr_amd_safe, + .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, + .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..64b838eac18 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,6 +3,7 @@ #include <linux/dmar.h> #include <linux/bootmem.h> #include <linux/pci.h> +#include <linux/kmemleak.h> #include <asm/proto.h> #include <asm/dma.h> @@ -32,7 +33,14 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -int iommu_pass_through; +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA ranslation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user want to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); @@ -88,6 +96,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else @@ -147,7 +160,7 @@ again: return NULL; addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, get_order(size)); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { @@ -212,10 +225,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d2e56b8f48e..98a827ee9ed 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - return force_iommu || - !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return force_iommu || !dma_capable(dev, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return !dma_capable(dev, addr, size); } /* Map a single continuous physical area into the IOMMU. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a09f3..a3933d4330c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { + if (hwdev && !dma_capable(hwdev, bus, size)) { if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .is_phys = 1, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_sg = nommu_map_sg, + .map_page = nommu_map_page, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, }; void __init no_iommu_init(void) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee4420..e8a35016115 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,31 +13,6 @@ int swiotlb __read_mostly; -void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - -void *swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984a..4cf79567cda 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,9 +61,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. */ @@ -350,14 +347,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +402,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +416,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9..ad535b68317 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,9 +55,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -386,9 +383,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -419,6 +424,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +471,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ @@ -480,15 +489,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c1..8d7d5c9c1be 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -35,10 +35,11 @@ #include <asm/proto.h> #include <asm/ds.h> -#include <trace/syscall.h> - #include "tls.h" +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1497,8 +1498,8 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1523,8 +1524,8 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index af71d06624b..6c3b2c6fd77 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) pci_read_config_dword(nb_ht, 0x60, &val); set_dev_node(&dev->dev, val & 7); - pci_dev_put(dev); + pci_dev_put(nb_ht); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a06e8d10184..27349f92a6d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -4,6 +4,7 @@ #include <linux/pm.h> #include <linux/efi.h> #include <linux/dmi.h> +#include <linux/tboot.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -508,6 +509,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -634,6 +637,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -645,6 +650,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index eb1f1e6e52b..19f15c4076f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include <linux/percpu.h> #include <linux/crash_dump.h> +#include <linux/tboot.h> #include <video/edid.h> @@ -987,6 +988,8 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); + tboot_probe(); + #ifdef CONFIG_X86_64 map_vsyscall(); #endif diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 07d81916f21..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Helpers for first chunk memory allocation */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + return pcpu_alloc_bootmem(cpu, size, align); } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static void __init pcpu_fc_free(void *ptr, size_t size) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_cpu_ids * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - return -EINVAL; - } - } - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = nr_cpu_ids * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < nr_cpu_ids - 1; i++) - for (j = i + 1; j < nr_cpu_ids; j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; + free_bootmem(__pa(ptr), size); } -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = nr_cpu_ids - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; -} +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } -/* - * 4k page allocator - * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) -{ - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; -} - -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ - ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) - ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); - else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", - pcpu_chosen_alloc, ret); - } - } else { - ret = setup_pcpu_lpage(static_size, false); - if (ret < 0) - ret = setup_pcpu_embed(static_size, false); +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } - if (ret < 0) - ret = setup_pcpu_4k(static_size); - if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..81e58238c4c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d720b7e0cf3..a25eeec0008 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -47,6 +47,7 @@ #include <linux/bootmem.h> #include <linux/err.h> #include <linux/nmi.h> +#include <linux/tboot.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -434,7 +435,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map */ - if (sched_mc_power_savings || sched_smt_power_savings) + if ((sched_mc_power_savings || sched_smt_power_savings) && + !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return c->llc_shared_map; @@ -1331,6 +1333,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863ef8c..3149032ff10 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/ptrace.h> +#include <asm/desc.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u32 *desc; + struct desc_struct *desc; unsigned long base; seg &= ~7UL; @@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re addr = -1L; /* bogus selector, access would fault */ else { desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); + base = get_desc_base(desc); /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) + if (!desc->d) addr &= 0xffff; addr += base; } diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211accf0..45e00eb09c3 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,9 +18,9 @@ #include <asm/ia32.h> #include <asm/syscalls.h> -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) { long error; struct file *file; @@ -226,7 +226,7 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user *name) +SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) { int err; down_read(&uts_sem); diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..86c9f91b48a --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,447 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dma_remapping.h> +#include <linux/init_task.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/dmar.h> +#include <linux/cpu.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/tboot.h> + +#include <asm/trampoline.h> +#include <asm/processor.h> +#include <asm/bootparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/io.h> + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + + /* S3 resume code */ + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE + /* AP trampoline code */ + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + + /* kernel code + data + bss */ + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +static atomic_t ap_wfs_count; + +static int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + timeout = AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + timeout) { + mdelay(1); + timeout--; + } + + if (timeout) + pr_warning("tboot wait for APs timeout\n"); + + return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); +} + +static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_DYING: + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return NOTIFY_BAD; + break; + } + return NOTIFY_OK; +} + +static struct notifier_block tboot_cpu_notifier __cpuinitdata = +{ + .notifier_call = tboot_cpu_callback, +}; + +static __init int tboot_late_init(void) +{ + if (!tboot_enabled()) + return 0; + + tboot_create_trampoline(); + + atomic_set(&ap_wfs_count, 0); + register_hotcpu_notifier(&tboot_cpu_notifier); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 77b9689f8ed..503c1f2e883 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -640,13 +640,13 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); + proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, + &proc_uv_ptc_operations); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); return -EINVAL; } - proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; return 0; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e4b1f5dec8..83264922a87 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -786,33 +786,34 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; } -#endif -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +/* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) { + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } /* @@ -846,17 +847,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9fc178255c0..0ccb57d5ee3 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -348,15 +348,12 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8600a09e0c6..b84e571f417 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -1,12 +1,8 @@ # # KVM configuration # -config HAVE_KVM - bool -config HAVE_KVM_IRQCHIP - bool - default y +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -29,6 +25,9 @@ config KVM select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES + select HAVE_KVM_IRQCHIP + select HAVE_KVM_EVENTFD + select KVM_APIC_ARCHITECTURE ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -63,18 +62,6 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. -config KVM_TRACE - bool "KVM trace support" - depends on KVM && SYSFS - select MARKERS - select RELAY - select DEBUG_FS - default n - ---help--- - This option allows reading a trace of kvm-related events through - relayfs. Note the ABI is not considered stable and will be - modified in future updates. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b43c4efafe8..0e7fe78d0f7 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,22 +1,19 @@ -# -# Makefile for Kernel-based Virtual Machine module -# - -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o irq_comm.o) -ifeq ($(CONFIG_KVM_TRACE),y) -common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) -endif -ifeq ($(CONFIG_IOMMU_API),y) -common-objs += $(addprefix ../../../virt/kvm/, iommu.o) -endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm -kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o -obj-$(CONFIG_KVM) += kvm.o -kvm-intel-objs = vmx.o -obj-$(CONFIG_KVM_INTEL) += kvm-intel.o -kvm-amd-objs = svm.o -obj-$(CONFIG_KVM_AMD) += kvm-amd.o +CFLAGS_x86.o := -I. +CFLAGS_svm.o := -I. +CFLAGS_vmx.o := -I. + +kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ + coalesced_mmio.o irq_comm.o eventfd.o) +kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) + +kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ + i8254.o timer.o +kvm-intel-y += vmx.o +kvm-amd-y += svm.o + +obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_INTEL) += kvm-intel.o +obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c index 616de4628d6..1be5cd640e9 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1,5 +1,5 @@ /****************************************************************************** - * x86_emulate.c + * emulate.c * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * @@ -30,7 +30,9 @@ #define DPRINTF(x...) do {} while (0) #endif #include <linux/module.h> -#include <asm/kvm_x86_emulate.h> +#include <asm/kvm_emulate.h> + +#include "mmu.h" /* for is_long_mode() */ /* * Opcode effective-address decode tables. @@ -60,6 +62,7 @@ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ #define SrcOne (7<<4) /* Implied '1' */ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -97,11 +100,11 @@ static u32 opcode_table[256] = { /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x18 - 0x1F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -195,7 +198,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImm | Src2Imm16, SrcImmByte | ImplicitOps, + SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ @@ -208,7 +211,7 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, + 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, @@ -216,7 +219,9 @@ static u32 twobyte_table[256] = { ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ImplicitOps, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -319,8 +324,11 @@ static u32 group2_table[] = { }; /* EFLAGS bit definitions. */ +#define EFLG_VM (1<<17) +#define EFLG_RF (1<<16) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -1027,6 +1035,7 @@ done_prefixes: c->src.type = OP_MEM; break; case SrcImm: + case SrcImmU: c->src.type = OP_IMM; c->src.ptr = (unsigned long *)c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; @@ -1044,6 +1053,19 @@ done_prefixes: c->src.val = insn_fetch(s32, 4, c->eip); break; } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } break; case SrcImmByte: case SrcImmUByte: @@ -1375,6 +1397,217 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) ctxt->interruptibility = mask; } +static inline void +setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, + struct kvm_segment *cs, struct kvm_segment *ss) +{ + memset(cs, 0, sizeof(struct kvm_segment)); + kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); + memset(ss, 0, sizeof(struct kvm_segment)); + + cs->l = 0; /* will be adjusted later */ + cs->base = 0; /* flat segment */ + cs->g = 1; /* 4kb granularity */ + cs->limit = 0xffffffff; /* 4GB limit */ + cs->type = 0x0b; /* Read, Execute, Accessed */ + cs->s = 1; + cs->dpl = 0; /* will be adjusted later */ + cs->present = 1; + cs->db = 1; + + ss->unusable = 0; + ss->base = 0; /* flat segment */ + ss->limit = 0xffffffff; /* 4GB limit */ + ss->g = 1; /* 4kb granularity */ + ss->s = 1; + ss->type = 0x03; /* Read/Write, Accessed */ + ss->db = 1; /* 32bit stack segment */ + ss->dpl = 0; + ss->present = 1; +} + +static int +emulate_syscall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* syscall is not available in real mode */ + if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + msr_data >>= 32; + cs.selector = (u16)(msr_data & 0xfffc); + ss.selector = (u16)(msr_data + 8); + + if (is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->regs[VCPU_REGS_RCX] = c->eip; + if (is_long_mode(ctxt->vcpu)) { +#ifdef CONFIG_X86_64 + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + + kvm_x86_ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ctxt->eflags &= ~(msr_data | EFLG_RF); +#endif + } else { + /* legacy mode */ + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + c->eip = (u32)msr_data; + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + return 0; +} + +static int +emulate_sysenter(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL || + !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* XXX sysenter/sysexit have not been tested in 64bit mode. + * Therefore, we inject an #UD. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (ctxt->mode) { + case X86EMUL_MODE_PROT32: + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + case X86EMUL_MODE_PROT64: + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + } + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + cs.selector = (u16)msr_data; + cs.selector &= ~SELECTOR_RPL_MASK; + ss.selector = cs.selector + 8; + ss.selector &= ~SELECTOR_RPL_MASK; + if (ctxt->mode == X86EMUL_MODE_PROT64 + || is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + c->regs[VCPU_REGS_RSP] = msr_data; + + return 0; +} + +static int +emulate_sysexit(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + int usermode; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* sysexit must be called from CPL 0 */ + if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + setup_syscalls_segments(ctxt, &cs, &ss); + + if ((c->rex_prefix & 0x8) != 0x0) + usermode = X86EMUL_MODE_PROT64; + else + usermode = X86EMUL_MODE_PROT32; + + cs.dpl = 3; + ss.dpl = 3; + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (usermode) { + case X86EMUL_MODE_PROT32: + cs.selector = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = (u16)(msr_data + 24); + break; + case X86EMUL_MODE_PROT64: + cs.selector = (u16)(msr_data + 32); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = cs.selector + 8; + cs.db = 0; + cs.l = 1; + break; + } + cs.selector |= SELECTOR_RPL_MASK; + ss.selector |= SELECTOR_RPL_MASK; + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + + return 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1970,6 +2203,12 @@ twobyte_insn: goto cannot_emulate; } break; + case 0x05: /* syscall */ + if (emulate_syscall(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; case 0x06: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; @@ -2036,6 +2275,18 @@ twobyte_insn: rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; break; + case 0x34: /* sysenter */ + if (emulate_sysenter(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x35: /* sysexit */ + if (emulate_sysexit(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; if (!test_cc(c->b, ctxt->eflags)) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 21f68e00524..82ad523b490 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -231,7 +231,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) + if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); return 0; } @@ -252,7 +252,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (vcpu->vcpu_id != 0 || !pit) + if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; timer = &pit->pit_state.pit_timer.timer; @@ -294,7 +294,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) pt->timer.function = kvm_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu_id = 0; + pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -332,33 +332,62 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(ps, val, 0); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { + create_pit_timer(ps, val, 0); + } break; case 2: case 3: - create_pit_timer(ps, val, 1); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ + create_pit_timer(ps, val, 1); + } break; default: destroy_pit_timer(&ps->pit_timer); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) +{ + u8 saved_mode; + if (hpet_legacy_start) { + /* save existing mode for later reenablement */ + saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; + kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(kvm, channel, val); + kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + } else { + pit_load_count(kvm, channel, val); + } +} + +static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pit, dev); +} + +static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); - pit_load_count(kvm, channel, val); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return container_of(dev, struct kvm_pit, speaker_dev); } -static void pit_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) +static inline int pit_in_range(gpa_t addr) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + return ((addr >= KVM_PIT_BASE_ADDRESS) && + (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); +} + +static int pit_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) +{ + struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; val &= 0xff; addr &= KVM_PIT_CHANNEL_MASK; @@ -421,16 +450,19 @@ static void pit_ioport_write(struct kvm_io_device *this, } mutex_unlock(&pit_state->lock); + return 0; } -static void pit_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int pit_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; int ret, count; struct kvm_kpit_channel_state *s; + if (!pit_in_range(addr)) + return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; s = &pit_state->channels[addr]; @@ -485,37 +517,36 @@ static void pit_ioport_read(struct kvm_io_device *this, memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); + return 0; } -static int pit_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) -{ - return ((addr >= KVM_PIT_BASE_ADDRESS) && - (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); -} - -static void speaker_ioport_write(struct kvm_io_device *this, - gpa_t addr, int len, const void *data) +static int speaker_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; mutex_lock(&pit_state->lock); pit_state->speaker_data_on = (val >> 1) & 1; pit_set_gate(kvm, 2, val & 1); mutex_unlock(&pit_state->lock); + return 0; } -static void speaker_ioport_read(struct kvm_io_device *this, - gpa_t addr, int len, void *data) +static int speaker_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) { - struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; struct kvm *kvm = pit->kvm; unsigned int refresh_clock; int ret; + if (addr != KVM_SPEAKER_BASE_ADDRESS) + return -EOPNOTSUPP; /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; @@ -527,12 +558,7 @@ static void speaker_ioport_read(struct kvm_io_device *this, len = sizeof(ret); memcpy(data, (char *)&ret, len); mutex_unlock(&pit_state->lock); -} - -static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) -{ - return (addr == KVM_SPEAKER_BASE_ADDRESS); + return 0; } void kvm_pit_reset(struct kvm_pit *pit) @@ -541,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit) struct kvm_kpit_channel_state *c; mutex_lock(&pit->pit_state.lock); + pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; @@ -563,10 +590,22 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) } } -struct kvm_pit *kvm_create_pit(struct kvm *kvm) +static const struct kvm_io_device_ops pit_dev_ops = { + .read = pit_ioport_read, + .write = pit_ioport_write, +}; + +static const struct kvm_io_device_ops speaker_dev_ops = { + .read = speaker_ioport_read, + .write = speaker_ioport_write, +}; + +/* Caller must have writers lock on slots_lock */ +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); if (!pit) @@ -582,19 +621,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - /* Initialize PIO device */ - pit->dev.read = pit_ioport_read; - pit->dev.write = pit_ioport_write; - pit->dev.in_range = pit_in_range; - pit->dev.private = pit; - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); - - pit->speaker_dev.read = speaker_ioport_read; - pit->speaker_dev.write = speaker_ioport_write; - pit->speaker_dev.in_range = speaker_in_range; - pit->speaker_dev.private = pit; - kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); - kvm->arch.vpit = pit; pit->kvm = kvm; @@ -613,7 +639,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit->mask_notifier.func = pit_mask_notifer; kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_iodevice_init(&pit->dev, &pit_dev_ops); + ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + if (ret < 0) + goto fail; + + if (flags & KVM_PIT_SPEAKER_DUMMY) { + kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); + ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + &pit->speaker_dev); + if (ret < 0) + goto fail_unregister; + } + return pit; + +fail_unregister: + __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + +fail: + if (pit->irq_source_id >= 0) + kvm_free_irq_source_id(kvm, pit->irq_source_id); + + kfree(pit); + return NULL; } void kvm_free_pit(struct kvm *kvm) @@ -623,6 +672,8 @@ void kvm_free_pit(struct kvm *kvm) if (kvm->arch.vpit) { kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); + kvm_unregister_irq_ack_notifier(kvm, + &kvm->arch.vpit->pit_state.irq_ack_notifier); mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); @@ -637,10 +688,10 @@ static void __inject_pit_timer_intr(struct kvm *kvm) struct kvm_vcpu *vcpu; int i; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); /* * Provides NMI watchdog support via Virtual Wire mode. @@ -652,11 +703,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm) * VCPU0, and only if its LVT0 is in EXTINT mode. */ if (kvm->arch.vapics_in_nmi_mode > 0) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (vcpu) - kvm_apic_nmi_wd_deliver(vcpu); - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); } void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) @@ -665,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_kpit_state *ps; - if (vcpu && pit) { + if (pit) { int inject = 0; ps = &pit->pit_state; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index bbd863ff60b..d4c1c7ffdc0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,6 +21,7 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; + u32 flags; struct kvm_timer pit_timer; bool is_periodic; u32 speaker_data_on; @@ -49,8 +50,8 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); -struct kvm_pit *kvm_create_pit(struct kvm *kvm); +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); +struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); void kvm_pit_reset(struct kvm_pit *pit); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 1ccb50c74f1..01f15168280 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,50 +30,24 @@ #include "irq.h" #include <linux/kvm_host.h> - -static void pic_lock(struct kvm_pic *s) - __acquires(&s->lock) -{ - spin_lock(&s->lock); -} - -static void pic_unlock(struct kvm_pic *s) - __releases(&s->lock) -{ - struct kvm *kvm = s->kvm; - unsigned acks = s->pending_acks; - bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu; - - s->pending_acks = 0; - s->wakeup_needed = false; - - spin_unlock(&s->lock); - - while (acks) { - kvm_notify_acked_irq(kvm, SELECT_PIC(__ffs(acks)), - __ffs(acks)); - acks &= acks - 1; - } - - if (wakeup) { - vcpu = s->kvm->vcpus[0]; - if (vcpu) - kvm_vcpu_kick(vcpu); - } -} +#include "trace.h" static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); s->isr_ack |= (1 << irq); + if (s != &s->pics_state->pics[0]) + irq += 8; + kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); + spin_lock(&s->lock); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; + spin_unlock(&s->lock); } /* @@ -174,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - pic_lock(s); + spin_lock(&s->lock); pic_update_irq(s); - pic_unlock(s); + spin_unlock(&s->lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -184,12 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - pic_lock(s); + spin_lock(&s->lock); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); + trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, + s->pics[irq >> 3].imr, ret == 0); } - pic_unlock(s); + spin_unlock(&s->lock); return ret; } @@ -217,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + spin_lock(&s->lock); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -242,8 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - pic_unlock(s); - kvm_notify_acked_irq(kvm, SELECT_PIC(irq), irq); + spin_unlock(&s->lock); return intno; } @@ -252,7 +227,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, irqbase, n; struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; + struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; if (s == &s->pics_state->pics[0]) irqbase = 0; @@ -263,7 +238,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) if (s->irr & (1 << irq) || s->isr & (1 << irq)) { n = irq + irqbase; - s->pics_state->pending_acks |= 1 << n; + kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); } } s->last_irr = 0; @@ -428,8 +403,7 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, - int len, int is_write) +static int picdev_in_range(gpa_t addr) { switch (addr) { case 0x20: @@ -444,18 +418,25 @@ static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, } } -static void picdev_write(struct kvm_io_device *this, +static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_pic, dev); +} + +static int picdev_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = this->private; + struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; if (len != 1) { if (printk_ratelimit()) printk(KERN_ERR "PIC: non byte write\n"); - return; + return 0; } - pic_lock(s); + spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -468,21 +449,24 @@ static void picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - pic_unlock(s); + spin_unlock(&s->lock); + return 0; } -static void picdev_read(struct kvm_io_device *this, - gpa_t addr, int len, void *val) +static int picdev_read(struct kvm_io_device *this, + gpa_t addr, int len, void *val) { - struct kvm_pic *s = this->private; + struct kvm_pic *s = to_pic(this); unsigned char data = 0; + if (!picdev_in_range(addr)) + return -EOPNOTSUPP; if (len != 1) { if (printk_ratelimit()) printk(KERN_ERR "PIC: non byte read\n"); - return; + return 0; } - pic_lock(s); + spin_lock(&s->lock); switch (addr) { case 0x20: case 0x21: @@ -496,7 +480,8 @@ static void picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - pic_unlock(s); + spin_unlock(&s->lock); + return 0; } /* @@ -505,20 +490,27 @@ static void picdev_read(struct kvm_io_device *this, static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; - struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - s->wakeup_needed = true; + kvm_vcpu_kick(vcpu); } } +static const struct kvm_io_device_ops picdev_ops = { + .read = picdev_read, + .write = picdev_write, +}; + struct kvm_pic *kvm_create_pic(struct kvm *kvm) { struct kvm_pic *s; + int ret; + s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; @@ -534,10 +526,12 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - s->dev.read = picdev_read; - s->dev.write = picdev_write; - s->dev.in_range = picdev_in_range; - s->dev.private = s; - kvm_io_bus_register_dev(&kvm->pio_bus, &s->dev); + kvm_iodevice_init(&s->dev, &picdev_ops); + ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + if (ret < 0) { + kfree(s); + return NULL; + } + return s; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f593188129..7d6058a2fd3 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,7 +63,6 @@ struct kvm_kpic_state { struct kvm_pic { spinlock_t lock; - bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1ff819dce7d..7bcc5b6a440 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) kvm_register_write(vcpu, VCPU_REGS_RIP, val); } +static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) +{ + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + + return vcpu->arch.pdptrs[index]; +} + #endif diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h deleted file mode 100644 index ed66e4c078d..00000000000 --- a/arch/x86/kvm/kvm_svm.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __KVM_SVM_H -#define __KVM_SVM_H - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <asm/msr.h> - -#include <asm/svm.h> - -static const u32 host_save_user_msrs[] = { -#ifdef CONFIG_X86_64 - MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, - MSR_FS_BASE, -#endif - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, -}; - -#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) - -struct kvm_vcpu; - -struct vcpu_svm { - struct kvm_vcpu vcpu; - struct vmcb *vmcb; - unsigned long vmcb_pa; - struct svm_cpu_data *svm_data; - uint64_t asid_generation; - - u64 next_rip; - - u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; - u64 host_gs_base; - unsigned long host_cr2; - - u32 *msrpm; - struct vmcb *hsave; - u64 hsave_msr; - - u64 nested_vmcb; - - /* These are the merged vectors */ - u32 *nested_msrpm; - - /* gpa pointers to the real vectors */ - u64 nested_vmcb_msrpm; -}; - -#endif - diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 26bd6ba74e1..55c7524dda5 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -6,7 +6,7 @@ struct kvm_timer { bool reinject; struct kvm_timer_ops *t_ops; struct kvm *kvm; - int vcpu_id; + struct kvm_vcpu *vcpu; }; struct kvm_timer_ops { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ae99d83f81a..1ae5ceba7eb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,8 +32,11 @@ #include <asm/current.h> #include <asm/apicdef.h> #include <asm/atomic.h> +#include <asm/apicdef.h> #include "kvm_cache_regs.h" #include "irq.h" +#include "trace.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -141,6 +144,26 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +void kvm_apic_set_version(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *feat; + u32 v = APIC_VERSION; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + v |= APIC_LVR_DIRECTED_EOI; + apic_set_reg(apic, APIC_LVR, v); +} + +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -165,36 +188,52 @@ static int find_highest_vector(void *bitmap) static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { + apic->irr_pending = true; return apic_test_and_set_vector(vec, apic->regs + APIC_IRR); } -static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +static inline int apic_search_irr(struct kvm_lapic *apic) { - apic_clear_vector(vec, apic->regs + APIC_IRR); + return find_highest_vector(apic->regs + APIC_IRR); } static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; - result = find_highest_vector(apic->regs + APIC_IRR); + if (!apic->irr_pending) + return -1; + + result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); return result; } +static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) +{ + apic->irr_pending = false; + apic_clear_vector(vec, apic->regs + APIC_IRR); + if (apic_search_irr(apic) != -1) + apic->irr_pending = true; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; + /* This may race with setting of irr in __apic_accept_irq() and + * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq + * will cause vmexit immediately and the value will be recalculated + * on the next vmentry. + */ if (!apic) return 0; highest_irr = apic_find_highest_irr(apic); return highest_irr; } -EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr); static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode); @@ -251,7 +290,12 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest) int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) { int result = 0; - u8 logical_id; + u32 logical_id; + + if (apic_x2apic_mode(apic)) { + logical_id = apic_get_reg(apic, APIC_LDR); + return logical_id & mda; + } logical_id = GET_APIC_LOGICAL_ID(apic_get_reg(apic, APIC_LDR)); @@ -331,6 +375,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; result = !apic_test_and_set_irr(vector, apic); + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, !result); if (!result) { if (trig_mode) apic_debug("level trig mode repeatedly for " @@ -425,7 +471,11 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { + mutex_lock(&apic->vcpu->kvm->irq_lock); + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + mutex_unlock(&apic->vcpu->kvm->irq_lock); + } } static void apic_send_ipi(struct kvm_lapic *apic) @@ -440,7 +490,12 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.level = icr_low & APIC_INT_ASSERT; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; - irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + if (apic_x2apic_mode(apic)) + irq.dest_id = icr_high; + else + irq.dest_id = GET_APIC_DEST_FIELD(icr_high); + + trace_kvm_apic_ipi(icr_low, irq.dest_id); apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " @@ -449,7 +504,9 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); + mutex_lock(&apic->vcpu->kvm->irq_lock); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); + mutex_unlock(&apic->vcpu->kvm->irq_lock); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -495,12 +552,16 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); - if (offset >= LAPIC_MMIO_LENGTH) return 0; switch (offset) { + case APIC_ID: + if (apic_x2apic_mode(apic)) + val = kvm_apic_id(apic); + else + val = kvm_apic_id(apic) << 24; + break; case APIC_ARBPRI: printk(KERN_WARNING "Access APIC ARBPRI register " "which is for P6\n"); @@ -522,21 +583,35 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return val; } -static void apic_mmio_read(struct kvm_io_device *this, - gpa_t address, int len, void *data) +static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) +{ + return container_of(dev, struct kvm_lapic, dev); +} + +static int apic_reg_read(struct kvm_lapic *apic, u32 offset, int len, + void *data) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - unsigned int offset = address - apic->base_address; unsigned char alignment = offset & 0xf; u32 result; + /* this bitmask has a bit cleared for each reserver register */ + static const u64 rmask = 0x43ff01ffffffe70cULL; if ((alignment + len) > 4) { - printk(KERN_ERR "KVM_APIC_READ: alignment error %lx %d", - (unsigned long)address, len); - return; + apic_debug("KVM_APIC_READ: alignment error %x %d\n", + offset, len); + return 1; } + + if (offset > 0x3f0 || !(rmask & (1ULL << (offset >> 4)))) { + apic_debug("KVM_APIC_READ: read reserved register %x\n", + offset); + return 1; + } + result = __apic_read(apic, offset & ~0xf); + trace_kvm_apic_read(offset, result); + switch (len) { case 1: case 2: @@ -548,6 +623,28 @@ static void apic_mmio_read(struct kvm_io_device *this, "should be 1,2, or 4 instead\n", len); break; } + return 0; +} + +static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) +{ + return apic_hw_enabled(apic) && + addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; +} + +static int apic_mmio_read(struct kvm_io_device *this, + gpa_t address, int len, void *data) +{ + struct kvm_lapic *apic = to_lapic(this); + u32 offset = address - apic->base_address; + + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; + + apic_reg_read(apic, offset, len, data); + + return 0; } static void update_divide_count(struct kvm_lapic *apic) @@ -573,6 +670,15 @@ static void start_apic_timer(struct kvm_lapic *apic) if (!apic->lapic_timer.period) return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + if (apic->lapic_timer.period < NSEC_PER_MSEC/2) + apic->lapic_timer.period = NSEC_PER_MSEC/2; + } hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), @@ -603,40 +709,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) apic->vcpu->kvm->arch.vapics_in_nmi_mode--; } -static void apic_mmio_write(struct kvm_io_device *this, - gpa_t address, int len, const void *data) +static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - unsigned int offset = address - apic->base_address; - unsigned char alignment = offset & 0xf; - u32 val; - - /* - * APIC register must be aligned on 128-bits boundary. - * 32/64/128 bits registers must be accessed thru 32 bits. - * Refer SDM 8.4.1 - */ - if (len != 4 || alignment) { - /* Don't shout loud, $infamous_os would cause only noise. */ - apic_debug("apic write: bad size=%d %lx\n", - len, (long)address); - return; - } - - val = *(u32 *) data; - - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, val); - - offset &= 0xff0; + int ret = 0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + trace_kvm_apic_write(reg, val); - switch (offset) { + switch (reg) { case APIC_ID: /* Local APIC ID */ - apic_set_reg(apic, APIC_ID, val); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_ID, val); + else + ret = 1; break; case APIC_TASKPRI: @@ -649,15 +733,24 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_LDR: - apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_LDR, val & APIC_LDR_MASK); + else + ret = 1; break; case APIC_DFR: - apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + if (!apic_x2apic_mode(apic)) + apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); + else + ret = 1; break; - case APIC_SPIV: - apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + case APIC_SPIV: { + u32 mask = 0x3ff; + if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + mask |= APIC_SPIV_DIRECTED_EOI; + apic_set_reg(apic, APIC_SPIV, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; @@ -672,7 +765,7 @@ static void apic_mmio_write(struct kvm_io_device *this, } break; - + } case APIC_ICR: /* No delay here, so we always clear the pending bit */ apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); @@ -680,7 +773,9 @@ static void apic_mmio_write(struct kvm_io_device *this, break; case APIC_ICR2: - apic_set_reg(apic, APIC_ICR2, val & 0xff000000); + if (!apic_x2apic_mode(apic)) + val &= 0xff000000; + apic_set_reg(apic, APIC_ICR2, val); break; case APIC_LVT0: @@ -694,8 +789,8 @@ static void apic_mmio_write(struct kvm_io_device *this, if (!apic_sw_enabled(apic)) val |= APIC_LVT_MASKED; - val &= apic_lvt_mask[(offset - APIC_LVTT) >> 4]; - apic_set_reg(apic, offset, val); + val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; + apic_set_reg(apic, reg, val); break; @@ -703,7 +798,7 @@ static void apic_mmio_write(struct kvm_io_device *this, hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); - return; + break; case APIC_TDCR: if (val & 4) @@ -712,27 +807,59 @@ static void apic_mmio_write(struct kvm_io_device *this, update_divide_count(apic); break; + case APIC_ESR: + if (apic_x2apic_mode(apic) && val != 0) { + printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + ret = 1; + } + break; + + case APIC_SELF_IPI: + if (apic_x2apic_mode(apic)) { + apic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + } else + ret = 1; + break; default: - apic_debug("Local APIC Write to read-only register %x\n", - offset); + ret = 1; break; } - + if (ret) + apic_debug("Local APIC Write to read-only register %x\n", reg); + return ret; } -static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, - int len, int size) +static int apic_mmio_write(struct kvm_io_device *this, + gpa_t address, int len, const void *data) { - struct kvm_lapic *apic = (struct kvm_lapic *)this->private; - int ret = 0; + struct kvm_lapic *apic = to_lapic(this); + unsigned int offset = address - apic->base_address; + u32 val; + if (!apic_mmio_in_range(apic, address)) + return -EOPNOTSUPP; - if (apic_hw_enabled(apic) && - (addr >= apic->base_address) && - (addr < (apic->base_address + LAPIC_MMIO_LENGTH))) - ret = 1; + /* + * APIC register must be aligned on 128-bits boundary. + * 32/64/128 bits registers must be accessed thru 32 bits. + * Refer SDM 8.4.1 + */ + if (len != 4 || (offset & 0xf)) { + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", len, (long)address); + return 0; + } - return ret; + val = *(u32*)data; + + /* too common printing */ + if (offset != APIC_EOI) + apic_debug("%s: offset 0x%x with length 0x%x, and value is " + "0x%x\n", __func__, offset, len, val); + + apic_reg_write(apic, offset & 0xff0, val); + + return 0; } void kvm_free_lapic(struct kvm_vcpu *vcpu) @@ -763,7 +890,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } -EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { @@ -776,7 +902,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) return (tpr & 0xf0) >> 4; } -EXPORT_SYMBOL_GPL(kvm_lapic_get_cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { @@ -787,10 +912,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; return; } - if (apic->vcpu->vcpu_id) + + if (!kvm_vcpu_is_bsp(apic->vcpu)) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; + if (apic_x2apic_mode(apic)) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); + apic_set_reg(apic, APIC_LDR, ldr); + } apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -800,12 +931,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_lapic_get_base); - void kvm_lapic_reset(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -821,7 +946,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); @@ -842,9 +967,10 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } + apic->irr_pending = false; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (vcpu->vcpu_id == 0) + if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; apic_update_ppr(apic); @@ -855,7 +981,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); } -EXPORT_SYMBOL_GPL(kvm_lapic_reset); bool kvm_apic_present(struct kvm_vcpu *vcpu) { @@ -866,7 +991,6 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu) { return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_enabled); /* *---------------------------------------------------------------------- @@ -917,6 +1041,11 @@ static struct kvm_timer_ops lapic_timer_ops = { .is_periodic = lapic_is_periodic, }; +static const struct kvm_io_device_ops apic_mmio_ops = { + .read = apic_mmio_read, + .write = apic_mmio_write, +}; + int kvm_create_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic; @@ -945,16 +1074,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->lapic_timer.timer.function = kvm_timer_fn; apic->lapic_timer.t_ops = &lapic_timer_ops; apic->lapic_timer.kvm = vcpu->kvm; - apic->lapic_timer.vcpu_id = vcpu->vcpu_id; + apic->lapic_timer.vcpu = vcpu; apic->base_address = APIC_DEFAULT_PHYS_BASE; vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE; kvm_lapic_reset(vcpu); - apic->dev.read = apic_mmio_read; - apic->dev.write = apic_mmio_write; - apic->dev.in_range = apic_mmio_range; - apic->dev.private = apic; + kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; nomem_free_apic: @@ -962,7 +1088,6 @@ nomem_free_apic: nomem: return -ENOMEM; } -EXPORT_SYMBOL_GPL(kvm_create_lapic); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { @@ -985,7 +1110,7 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (vcpu->vcpu_id == 0) { + if (kvm_vcpu_is_bsp(vcpu)) { if (!apic_hw_enabled(vcpu->arch.apic)) r = 1; if ((lvt0 & APIC_LVT_MASKED) == 0 && @@ -1025,7 +1150,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) apic->base_address = vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(vcpu); + apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); @@ -1092,3 +1218,35 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) vcpu->arch.apic->vapic_addr = vapic_addr; } + +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + /* if this is ICR write vector before command */ + if (msr == 0x830) + apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32)); + return apic_reg_write(apic, reg, (u32)data); +} + +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; + + if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + return 1; + + if (apic_reg_read(apic, reg, 4, &low)) + return 1; + if (msr == 0x830) + apic_reg_read(apic, APIC_ICR2, 4, &high); + + *data = (((u64)high) << 32) | low; + + return 0; +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index a587f8349c4..40010b09c4a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -12,6 +12,7 @@ struct kvm_lapic { struct kvm_timer lapic_timer; u32 divide_count; struct kvm_vcpu *vcpu; + bool irr_pending; struct page *regs_page; void *regs; gpa_t vapic_addr; @@ -28,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); @@ -44,4 +46,6 @@ void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ef5bb2b404..eca41ae9f45 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "kvm_cache_regs.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -107,6 +108,9 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_MASK(level) \ (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) +#define PT32_LVL_OFFSET_MASK(level) \ + (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) #define PT32_INDEX(address, level)\ (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) @@ -115,10 +119,19 @@ module_param(oos_shadow, bool, 0644); #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) +#define PT64_LVL_ADDR_MASK(level) \ + (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) #define PT32_BASE_ADDR_MASK PAGE_MASK #define PT32_DIR_BASE_ADDR_MASK \ (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) +#define PT32_LVL_ADDR_MASK(level) \ + (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT32_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) @@ -129,6 +142,7 @@ module_param(oos_shadow, bool, 0644); #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) +#define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 @@ -139,10 +153,13 @@ module_param(oos_shadow, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#define CREATE_TRACE_POINTS +#include "mmutrace.h" + #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) struct kvm_rmap_desc { - u64 *shadow_ptes[RMAP_EXT]; + u64 *sptes[RMAP_EXT]; struct kvm_rmap_desc *more; }; @@ -239,16 +256,25 @@ static int is_writeble_pte(unsigned long pte) return pte & PT_WRITABLE_MASK; } -static int is_dirty_pte(unsigned long pte) +static int is_dirty_gpte(unsigned long pte) { - return pte & shadow_dirty_mask; + return pte & PT_DIRTY_MASK; } -static int is_rmap_pte(u64 pte) +static int is_rmap_spte(u64 pte) { return is_shadow_present_pte(pte); } +static int is_last_spte(u64 pte, int level) +{ + if (level == PT_PAGE_TABLE_LEVEL) + return 1; + if (is_large_pte(pte)) + return 1; + return 0; +} + static pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -261,7 +287,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) return (gpte & PT32_DIR_PSE36_MASK) << shift; } -static void set_shadow_pte(u64 *sptep, u64 spte) +static void __set_spte(u64 *sptep, u64 spte) { #ifdef CONFIG_X86_64 set_64bit((unsigned long *)sptep, spte); @@ -380,37 +406,52 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. */ -static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) +static int *slot_largepage_idx(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); - return &slot->lpage_info[idx].write_count; + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + return &slot->lpage_info[level - 2][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; int *write_count; + int i; gfn = unalias_gfn(kvm, gfn); - write_count = slot_largepage_idx(gfn, - gfn_to_memslot_unaliased(kvm, gfn)); - *write_count += 1; + + slot = gfn_to_memslot_unaliased(kvm, gfn); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + write_count = slot_largepage_idx(gfn, slot, i); + *write_count += 1; + } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { + struct kvm_memory_slot *slot; int *write_count; + int i; gfn = unalias_gfn(kvm, gfn); - write_count = slot_largepage_idx(gfn, - gfn_to_memslot_unaliased(kvm, gfn)); - *write_count -= 1; - WARN_ON(*write_count < 0); + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + slot = gfn_to_memslot_unaliased(kvm, gfn); + write_count = slot_largepage_idx(gfn, slot, i); + *write_count -= 1; + WARN_ON(*write_count < 0); + } } -static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) +static int has_wrprotected_page(struct kvm *kvm, + gfn_t gfn, + int level) { struct kvm_memory_slot *slot; int *largepage_idx; @@ -418,47 +459,67 @@ static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) gfn = unalias_gfn(kvm, gfn); slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { - largepage_idx = slot_largepage_idx(gfn, slot); + largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; } return 1; } -static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) +static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { + unsigned long page_size = PAGE_SIZE; struct vm_area_struct *vma; unsigned long addr; - int ret = 0; + int i, ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return ret; + return page_size; down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) - ret = 1; + if (!vma) + goto out; + + page_size = vma_kernel_pagesize(vma); + +out: up_read(¤t->mm->mmap_sem); + for (i = PT_PAGE_TABLE_LEVEL; + i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + if (page_size >= KVM_HPAGE_SIZE(i)) + ret = i; + else + break; + } + return ret; } -static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - - if (has_wrprotected_page(vcpu->kvm, large_gfn)) - return 0; - - if (!host_largepage_backed(vcpu->kvm, large_gfn)) - return 0; + int host_level; + int level = PT_PAGE_TABLE_LEVEL; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) - return 0; + return PT_PAGE_TABLE_LEVEL; - return 1; + host_level = host_mapping_level(vcpu->kvm, large_gfn); + + if (host_level == PT_PAGE_TABLE_LEVEL) + return host_level; + + for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { + + if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + break; + } + + return level - 1; } /* @@ -466,19 +527,19 @@ static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) * Note: gfn must be unaliased before this function get called */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; unsigned long idx; slot = gfn_to_memslot(kvm, gfn); - if (!lpage) + if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); + idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); - return &slot->lpage_info[idx].rmap_pde; + return &slot->lpage_info[level - 2][idx].rmap_pde; } /* @@ -494,42 +555,42 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) * the spte was not added. * */ -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) +static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; unsigned long *rmapp; int i, count = 0; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); *rmapp = (unsigned long)spte; } else if (!(*rmapp & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)*rmapp; - desc->shadow_ptes[1] = spte; + desc->sptes[0] = (u64 *)*rmapp; + desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) { + while (desc->sptes[RMAP_EXT-1] && desc->more) { desc = desc->more; count += RMAP_EXT; } - if (desc->shadow_ptes[RMAP_EXT-1]) { + if (desc->sptes[RMAP_EXT-1]) { desc->more = mmu_alloc_rmap_desc(vcpu); desc = desc->more; } - for (i = 0; desc->shadow_ptes[i]; ++i) + for (i = 0; desc->sptes[i]; ++i) ; - desc->shadow_ptes[i] = spte; + desc->sptes[i] = spte; } return count; } @@ -541,14 +602,14 @@ static void rmap_desc_remove_entry(unsigned long *rmapp, { int j; - for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j) + for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) ; - desc->shadow_ptes[i] = desc->shadow_ptes[j]; - desc->shadow_ptes[j] = NULL; + desc->sptes[i] = desc->sptes[j]; + desc->sptes[j] = NULL; if (j != 0) return; if (!prev_desc && !desc->more) - *rmapp = (unsigned long)desc->shadow_ptes[0]; + *rmapp = (unsigned long)desc->sptes[0]; else if (prev_desc) prev_desc->more = desc->more; @@ -566,7 +627,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) unsigned long *rmapp; int i; - if (!is_rmap_pte(*spte)) + if (!is_rmap_spte(*spte)) return; sp = page_header(__pa(spte)); pfn = spte_to_pfn(*spte); @@ -576,7 +637,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) kvm_release_pfn_dirty(pfn); else kvm_release_pfn_clean(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -593,8 +654,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) - if (desc->shadow_ptes[i] == spte) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) + if (desc->sptes[i] == spte) { rmap_desc_remove_entry(rmapp, desc, i, prev_desc); @@ -625,10 +686,10 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) prev_desc = NULL; prev_spte = NULL; while (desc) { - for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) { + for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { if (prev_spte == spte) - return desc->shadow_ptes[i]; - prev_spte = desc->shadow_ptes[i]; + return desc->sptes[i]; + prev_spte = desc->sptes[i]; } desc = desc->more; } @@ -639,10 +700,10 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; - int write_protected = 0; + int i, write_protected = 0; gfn = unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn, 0); + rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -650,7 +711,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writeble_pte(*spte)) { - set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK); + __set_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -664,21 +725,24 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } /* check for huge page mappings */ - rmapp = gfn_to_rmap(kvm, gfn, 1); - spte = rmap_next(kvm, rmapp, NULL); - while (spte) { - BUG_ON(!spte); - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writeble_pte(*spte)) { - rmap_remove(kvm, spte); - --kvm->stat.lpages; - set_shadow_pte(spte, shadow_trap_nonpresent_pte); - spte = NULL; - write_protected = 1; + for (i = PT_DIRECTORY_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = gfn_to_rmap(kvm, gfn, i); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); + if (is_writeble_pte(*spte)) { + rmap_remove(kvm, spte); + --kvm->stat.lpages; + __set_spte(spte, shadow_trap_nonpresent_pte); + spte = NULL; + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); } - spte = rmap_next(kvm, rmapp, spte); } return write_protected; @@ -693,7 +757,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); rmap_remove(kvm, spte); - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -702,7 +766,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int (*handler)(struct kvm *kvm, unsigned long *rmapp)) { - int i; + int i, j; int retval = 0; /* @@ -721,11 +785,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + retval |= handler(kvm, &memslot->rmap[gfn_offset]); - retval |= handler(kvm, - &memslot->lpage_info[ - gfn_offset / - KVM_PAGES_PER_HPAGE].rmap_pde); + + for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { + int idx = gfn_offset; + idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + retval |= handler(kvm, + &memslot->lpage_info[j][idx].rmap_pde); + } } } @@ -763,12 +831,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { unsigned long *rmapp; + struct kvm_mmu_page *sp; + + sp = page_header(__pa(spte)); gfn = unalias_gfn(vcpu->kvm, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp); kvm_flush_remote_tlbs(vcpu->kvm); @@ -1109,6 +1180,7 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 1; } + trace_kvm_mmu_sync_page(sp); if (rmap_write_protect(vcpu->kvm, sp->gfn)) kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); @@ -1231,8 +1303,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - pgprintk("%s: looking gfn %lx role %x\n", __func__, - gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) @@ -1249,14 +1319,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(vcpu, sp); } - pgprintk("%s: found\n", __func__); + trace_kvm_mmu_get_page(sp, false); return sp; } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return sp; - pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); @@ -1269,6 +1338,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, vcpu->arch.mmu.prefetch_page(vcpu, sp); else nonpaging_prefetch_page(vcpu, sp); + trace_kvm_mmu_get_page(sp, true); return sp; } @@ -1292,6 +1362,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) return false; + + if (iterator->level == PT_PAGE_TABLE_LEVEL) + if (is_large_pte(*iterator->sptep)) + return false; + iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; @@ -1312,25 +1387,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, pt = sp->spt; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (is_shadow_present_pte(pt[i])) - rmap_remove(kvm, &pt[i]); - pt[i] = shadow_trap_nonpresent_pte; - } - return; - } - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { ent = pt[i]; if (is_shadow_present_pte(ent)) { - if (!is_large_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { ent &= PT64_BASE_ADDR_MASK; mmu_page_remove_parent_pte(page_header(ent), &pt[i]); } else { - --kvm->stat.lpages; + if (is_large_pte(ent)) + --kvm->stat.lpages; rmap_remove(kvm, &pt[i]); } } @@ -1346,10 +1413,10 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) { int i; + struct kvm_vcpu *vcpu; - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm->vcpus[i]->arch.last_pte_updated = NULL; + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.last_pte_updated = NULL; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1368,7 +1435,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } BUG_ON(!parent_pte); kvm_mmu_put_page(sp, parent_pte); - set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); + __set_spte(parent_pte, shadow_trap_nonpresent_pte); } } @@ -1400,6 +1467,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { int ret; + + trace_kvm_mmu_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); @@ -1516,7 +1585,7 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { if (pt[i] == shadow_notrap_nonpresent_pte) - set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + __set_spte(&pt[i], shadow_trap_nonpresent_pte); } } @@ -1646,6 +1715,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) struct kvm_mmu_page *s; struct hlist_node *node, *n; + trace_kvm_mmu_unsync_page(sp); index = kvm_page_table_hashfn(sp->gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ @@ -1682,9 +1752,9 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, +static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, - int write_fault, int dirty, int largepage, + int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { @@ -1707,7 +1777,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (largepage) + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, @@ -1718,7 +1788,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + if (level > PT_PAGE_TABLE_LEVEL && + has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; spte = shadow_trap_nonpresent_pte; goto set_pte; @@ -1732,7 +1803,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, * is responsibility of mmu_get_page / kvm_sync_page. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writeble_pte(*shadow_pte)) + if (!can_unsync && is_writeble_pte(*sptep)) goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { @@ -1749,65 +1820,67 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, mark_page_dirty(vcpu->kvm, gfn); set_pte: - set_shadow_pte(shadow_pte, spte); + __set_spte(sptep, spte); return ret; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, + int *ptwrite, int level, gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); + int was_writeble = is_writeble_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, + __func__, *sptep, pt_access, write_fault, user_fault, gfn); - if (is_rmap_pte(*shadow_pte)) { + if (is_rmap_spte(*sptep)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ - if (largepage && !is_large_pte(*shadow_pte)) { + if (level > PT_PAGE_TABLE_LEVEL && + !is_large_pte(*sptep)) { struct kvm_mmu_page *child; - u64 pte = *shadow_pte; + u64 pte = *sptep; child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { + mmu_page_remove_parent_pte(child, sptep); + } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); + spte_to_pfn(*sptep), pfn); + rmap_remove(vcpu->kvm, sptep); } else was_rmapped = 1; } - if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, + dirty, level, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); } - pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - is_large_pte(*shadow_pte)? "2MB" : "4kB", - is_present_pte(*shadow_pte)?"RW":"R", gfn, - *shadow_pte, shadow_pte); - if (!was_rmapped && is_large_pte(*shadow_pte)) + is_large_pte(*sptep)? "2MB" : "4kB", + *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep, sptep); + if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; - page_header_update_slot(vcpu->kvm, shadow_pte, gfn); + page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { - rmap_count = rmap_add(vcpu, shadow_pte, gfn, largepage); - if (!is_rmap_pte(*shadow_pte)) + rmap_count = rmap_add(vcpu, sptep, gfn); + if (!is_rmap_spte(*sptep)) kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, gfn, largepage); + rmap_recycle(vcpu, sptep, gfn); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); @@ -1815,7 +1888,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, kvm_release_pfn_clean(pfn); } if (speculative) { - vcpu->arch.last_pte_updated = shadow_pte; + vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; } } @@ -1825,7 +1898,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) + int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -1833,11 +1906,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, gfn_t pseudo_gfn; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == PT_PAGE_TABLE_LEVEL - || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - largepage, gfn, pfn, false); + level, gfn, pfn, false); ++vcpu->stat.pf_fixed; break; } @@ -1853,10 +1925,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - set_shadow_pte(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + __set_spte(iterator.sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } } return pt_write; @@ -1865,14 +1937,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; - int largepage = 0; + int level; pfn_t pfn; unsigned long mmu_seq; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + /* + * This path builds a PAE pagetable - so we can map 2mb pages at + * maximum. Therefore check if the level is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -1888,7 +1966,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn); + r = __direct_map(vcpu, v, write, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1954,6 +2032,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; struct kvm_mmu_page *sp; int direct = 0; + u64 pdptr; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1981,11 +2060,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + root_gfn = pdptr >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) @@ -2062,7 +2142,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, { pfn_t pfn; int r; - int largepage = 0; + int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; @@ -2073,10 +2153,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -2089,7 +2169,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn); + level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -2206,7 +2286,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2]; + context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51) | + rsvd_bits(13, 29); context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ @@ -2357,8 +2439,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); if (r) goto out; + /* set_cr3() should ensure TLB has been flushed */ kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); - kvm_mmu_flush_tlb(vcpu); out: return r; } @@ -2378,15 +2460,14 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL || - is_large_pte(pte)) + if (is_last_spte(pte, sp->role.level)) rmap_remove(vcpu->kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); } } - set_shadow_pte(spte, shadow_trap_nonpresent_pte); + __set_spte(spte, shadow_trap_nonpresent_pte); if (is_large_pte(pte)) --vcpu->kvm->stat.lpages; } @@ -2397,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - if (!vcpu->arch.update_pte.largepage || - sp->role.glevels == PT32_ROOT_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + ++vcpu->kvm->stat.mmu_pde_zapped; + return; } ++vcpu->kvm->stat.mmu_pte_updated; @@ -2447,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 gpte = 0; pfn_t pfn; - vcpu->arch.update_pte.largepage = 0; - if (bytes != 4 && bytes != 8) return; @@ -2472,14 +2548,10 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if ((bytes == 4) && (gpa % 4 == 0)) memcpy((void *)&gpte, new, 4); } - if (!is_present_pte(gpte)) + if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); - vcpu->arch.update_pte.largepage = 1; - } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -2622,6 +2694,9 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; + if (tdp_enabled) + return 0; + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); spin_lock(&vcpu->kvm->mmu_lock); @@ -2633,7 +2708,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) { + while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, @@ -2670,8 +2746,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) ++vcpu->stat.mmio_exits; return 0; case EMULATE_FAIL: - kvm_report_emulation_failure(vcpu, "pagetable"); - return 1; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + return 0; default: BUG(); } @@ -2712,12 +2789,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first @@ -3029,6 +3100,24 @@ out: return r; } +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) +{ + struct kvm_shadow_walk_iterator iterator; + int nr_sptes = 0; + + spin_lock(&vcpu->kvm->mmu_lock); + for_each_shadow_entry(vcpu, addr, iterator) { + sptes[iterator.level-1] = *iterator.sptep; + nr_sptes++; + if (!is_shadow_present_pte(*iterator.sptep)) + break; + } + spin_unlock(&vcpu->kvm->mmu_lock); + + return nr_sptes; +} +EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); + #ifdef AUDIT static const char *audit_msg; @@ -3041,6 +3130,54 @@ static gva_t canonicalize(gva_t gva) return gva; } + +typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, + u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } else + fn(kvm, sp, &sp->spt[i]); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va, int level) { @@ -3055,20 +3192,19 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, continue; va = canonicalize(va); - if (level > 1) { - if (ent == shadow_notrap_nonpresent_pte) - printk(KERN_ERR "audit: (%s) nontrapping pte" - " in nonleaf level: levels %d gva %lx" - " level %d pte %llx\n", audit_msg, - vcpu->arch.mmu.root_level, va, level, ent); - else - audit_mappings_page(vcpu, ent, va, level - 1); - } else { + if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) + audit_mappings_page(vcpu, ent, va, level - 1); + else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + continue; + } + if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) printk(KERN_ERR "xx audit error: (%s) levels %d" @@ -3122,7 +3258,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); while (d) { for (k = 0; k < RMAP_EXT; ++k) - if (d->shadow_ptes[k]) + if (d->sptes[k]) ++nmaps; else break; @@ -3133,9 +3269,48 @@ static int count_rmaps(struct kvm_vcpu *vcpu) return nmaps; } -static int count_writable_mappings(struct kvm_vcpu *vcpu) +void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + if (*sptep & PT_WRITABLE_MASK) { + rev_sp = page_header(__pa(sptep)); + gfn = rev_sp->gfns[sptep - rev_sp->spt]; + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %ld\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + audit_msg, sptep - rev_sp->spt, + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], + is_large_pte(*sptep)); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } + } + +} + +void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) { - int nmaps = 0; struct kvm_mmu_page *sp; int i; @@ -3152,20 +3327,16 @@ static int count_writable_mappings(struct kvm_vcpu *vcpu) continue; if (!(ent & PT_WRITABLE_MASK)) continue; - ++nmaps; + inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); } } - return nmaps; + return; } static void audit_rmap(struct kvm_vcpu *vcpu) { - int n_rmap = count_rmaps(vcpu); - int n_actual = count_writable_mappings(vcpu); - - if (n_rmap != n_actual) - printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __func__, audit_msg, n_rmap, n_actual); + check_writable_mappings_rmap(vcpu); + count_rmaps(vcpu); } static void audit_write_protection(struct kvm_vcpu *vcpu) @@ -3173,20 +3344,28 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; struct kvm_memory_slot *slot; unsigned long *rmapp; + u64 *spte; gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { if (sp->role.direct) continue; + if (sp->unsync) + continue; gfn = unalias_gfn(vcpu->kvm, sp->gfn); slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; - if (*rmapp) - printk(KERN_ERR "%s: (%s) shadow page has writable" - " mappings: gfn %lx role %x\n", + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (*spte & PT_WRITABLE_MASK) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } } } @@ -3198,7 +3377,9 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_msg = msg; audit_rmap(vcpu); audit_write_protection(vcpu); - audit_mappings(vcpu); + if (strcmp("pre pte write", audit_msg) != 0) + audit_mappings(vcpu); + audit_writable_sptes_have_rmaps(vcpu); dbg = olddbg; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3494a2fb136..61a1b3884b4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -37,6 +37,8 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) @@ -75,7 +77,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return vcpu->arch.cr0 & X86_CR0_PG; } -static inline int is_present_pte(unsigned long pte) +static inline int is_present_gpte(unsigned long pte) { return pte & PT_PRESENT_MASK; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h new file mode 100644 index 00000000000..3e4a5c6ca2a --- /dev/null +++ b/arch/x86/kvm/mmutrace.h @@ -0,0 +1,220 @@ +#if !defined(_TRACE_KVMMMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVMMMU_H + +#include <linux/tracepoint.h> +#include <linux/ftrace_event.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvmmmu +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE mmutrace + +#define KVM_MMU_PAGE_FIELDS \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ + __field(__u32, unsync) + +#define KVM_MMU_PAGE_ASSIGN(sp) \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ + __entry->unsync = sp->unsync; + +#define KVM_MMU_PAGE_PRINTK() ({ \ + const char *ret = p->buffer + p->len; \ + static const char *access_str[] = { \ + "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ + }; \ + union kvm_mmu_page_role role; \ + \ + role.word = __entry->role; \ + \ + trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ + " %snxe root %u %s%c", \ + __entry->gfn, role.level, role.glevels, \ + role.quadrant, \ + role.direct ? " direct" : "", \ + access_str[role.access], \ + role.invalid ? " invalid" : "", \ + role.cr4_pge ? "" : "!", \ + role.nxe ? "" : "!", \ + __entry->root_count, \ + __entry->unsync ? "unsync" : "sync", 0); \ + ret; \ + }) + +#define kvm_mmu_trace_pferr_flags \ + { PFERR_PRESENT_MASK, "P" }, \ + { PFERR_WRITE_MASK, "W" }, \ + { PFERR_USER_MASK, "U" }, \ + { PFERR_RSVD_MASK, "RSVD" }, \ + { PFERR_FETCH_MASK, "F" } + +/* + * A pagetable walk has started + */ +TRACE_EVENT( + kvm_mmu_pagetable_walk, + TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), + TP_ARGS(addr, write_fault, user_fault, fetch_fault), + + TP_STRUCT__entry( + __field(__u64, addr) + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) + | (!!fetch_fault << 4); + ), + + TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + + +/* We just walked a paging element */ +TRACE_EVENT( + kvm_mmu_paging_element, + TP_PROTO(u64 pte, int level), + TP_ARGS(pte, level), + + TP_STRUCT__entry( + __field(__u64, pte) + __field(__u32, level) + ), + + TP_fast_assign( + __entry->pte = pte; + __entry->level = level; + ), + + TP_printk("pte %llx level %u", __entry->pte, __entry->level) +); + +/* We set a pte accessed bit */ +TRACE_EVENT( + kvm_mmu_set_accessed_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +/* We set a pte dirty bit */ +TRACE_EVENT( + kvm_mmu_set_dirty_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), + + TP_STRUCT__entry( + __field(__u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) + + index * size; + ), + + TP_printk("gpa %llx", __entry->gpa) +); + +TRACE_EVENT( + kvm_mmu_walker_error, + TP_PROTO(u32 pferr), + TP_ARGS(pferr), + + TP_STRUCT__entry( + __field(__u32, pferr) + ), + + TP_fast_assign( + __entry->pferr = pferr; + ), + + TP_printk("pferr %x %s", __entry->pferr, + __print_flags(__entry->pferr, "|", kvm_mmu_trace_pferr_flags)) +); + +TRACE_EVENT( + kvm_mmu_get_page, + TP_PROTO(struct kvm_mmu_page *sp, bool created), + TP_ARGS(sp, created), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + __field(bool, created) + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + __entry->created = created; + ), + + TP_printk("%s %s", KVM_MMU_PAGE_PRINTK(), + __entry->created ? "new" : "existing") +); + +TRACE_EVENT( + kvm_mmu_sync_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +TRACE_EVENT( + kvm_mmu_unsync_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +TRACE_EVENT( + kvm_mmu_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), + TP_ARGS(sp), + + TP_STRUCT__entry( + KVM_MMU_PAGE_FIELDS + ), + + TP_fast_assign( + KVM_MMU_PAGE_ASSIGN(sp) + ), + + TP_printk("%s", KVM_MMU_PAGE_PRINTK()) +); + +#endif /* _TRACE_KVMMMU_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67785f63539..d2fec9c12d2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -27,7 +27,8 @@ #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS @@ -43,7 +44,8 @@ #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK - #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK + #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) + #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS @@ -53,8 +55,8 @@ #error Invalid PTTYPE value #endif -#define gpte_to_gfn FNAME(gpte_to_gfn) -#define gpte_to_gfn_pde FNAME(gpte_to_gfn_pde) +#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) /* * The guest_walker structure emulates the behavior of the hardware page @@ -71,14 +73,9 @@ struct guest_walker { u32 error_code; }; -static gfn_t gpte_to_gfn(pt_element_t gpte) +static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { - return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; -} - -static gfn_t gpte_to_gfn_pde(pt_element_t gpte) -{ - return (gpte & PT_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, @@ -125,14 +122,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker, gpa_t pte_gpa; int rsvd_fault = 0; - pgprintk("%s: addr %lx\n", __func__, addr); + trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, + fetch_fault); walk: walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; - if (!is_present_pte(pte)) + pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + trace_kvm_mmu_paging_element(pte, walker->level); + if (!is_present_gpte(pte)) goto not_present; --walker->level; } @@ -150,12 +149,11 @@ walk: pte_gpa += index * sizeof(pt_element_t); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __func__, - walker->level - 1, table_gfn); kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_pte(pte)) + if (!is_present_gpte(pte)) goto not_present; rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); @@ -175,6 +173,8 @@ walk: #endif if (!(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, + sizeof(pte)); mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) @@ -186,18 +186,24 @@ walk: walker->ptes[walker->level - 1] = pte; - if (walker->level == PT_PAGE_TABLE_LEVEL) { - walker->gfn = gpte_to_gfn(pte); - break; - } - - if (walker->level == PT_DIRECTORY_LEVEL - && (pte & PT_PAGE_SIZE_MASK) - && (PTTYPE == 64 || is_pse(vcpu))) { - walker->gfn = gpte_to_gfn_pde(pte); - walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL); - if (PTTYPE == 32 && is_cpuid_PSE36()) + if ((walker->level == PT_PAGE_TABLE_LEVEL) || + ((walker->level == PT_DIRECTORY_LEVEL) && + (pte & PT_PAGE_SIZE_MASK) && + (PTTYPE == 64 || is_pse(vcpu))) || + ((walker->level == PT_PDPE_LEVEL) && + (pte & PT_PAGE_SIZE_MASK) && + is_long_mode(vcpu))) { + int lvl = walker->level; + + walker->gfn = gpte_to_gfn_lvl(pte, lvl); + walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) + >> PAGE_SHIFT; + + if (PTTYPE == 32 && + walker->level == PT_DIRECTORY_LEVEL && + is_cpuid_PSE36()) walker->gfn += pse36_gfn_delta(pte); + break; } @@ -205,9 +211,10 @@ walk: --walker->level; } - if (write_fault && !is_dirty_pte(pte)) { + if (write_fault && !is_dirty_gpte(pte)) { bool ret; + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); @@ -239,6 +246,7 @@ err: walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } @@ -248,12 +256,11 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; - int largepage = vcpu->arch.update_pte.largepage; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!is_present_pte(gpte)) - set_shadow_pte(spte, shadow_notrap_nonpresent_pte); + if (!is_present_gpte(gpte)) + __set_spte(spte, shadow_notrap_nonpresent_pte); return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); @@ -267,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, + gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true); } @@ -276,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int largepage, + int user_fault, int write_fault, int hlevel, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; @@ -289,19 +296,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; - if (!is_present_pte(gw->ptes[gw->level - 1])) + if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; - if (level == PT_PAGE_TABLE_LEVEL - || (largepage && level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == hlevel) { mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, + ptwrite, level, gw->gfn, pfn, false); break; } @@ -311,16 +317,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } - if (level == PT_DIRECTORY_LEVEL - && gw->level == PT_DIRECTORY_LEVEL) { + if (level <= gw->level) { + int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_pte(gw->ptes[level - 1])) + if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + table_gfn = gpte_to_gfn(gw->ptes[level - delta]); + /* advance table_gfn when emulating 1gb pages with 4k */ + if (delta == 0) + table_gfn += PT_INDEX(addr, level); } else { direct = 0; table_gfn = gw->table_gfn[level - 2]; @@ -369,11 +378,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int user_fault = error_code & PFERR_USER_MASK; int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; - u64 *shadow_pte; + u64 *sptep; int write_pt = 0; int r; pfn_t pfn; - int largepage = 0; + int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -399,14 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - if (walker.level == PT_DIRECTORY_LEVEL) { - gfn_t large_gfn; - large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); - if (is_largepage_backed(vcpu, large_gfn)) { - walker.gfn = large_gfn; - largepage = 1; - } + if (walker.level >= PT_DIRECTORY_LEVEL) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); @@ -422,11 +428,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, pfn); - + sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, + level, &write_pt, pfn); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, - shadow_pte, *shadow_pte, write_pt); + sptep, *sptep, write_pt); if (!write_pt) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ @@ -459,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) sptep = iterator.sptep; /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || + ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); pte_gpa = (sp->gfn << PAGE_SHIFT); @@ -472,7 +478,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) --vcpu->kvm->stat.lpages; need_flush = 1; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } @@ -489,7 +495,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) return; - if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, @@ -536,7 +542,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); for (j = 0; j < ARRAY_SIZE(pt); ++j) - if (r || is_present_pte(pt[j])) + if (r || is_present_gpte(pt[j])) sp->spt[i+j] = shadow_trap_nonpresent_pte; else sp->spt[i+j] = shadow_notrap_nonpresent_pte; @@ -574,23 +580,23 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_pte(gpte)) + if (is_present_gpte(gpte)) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - set_shadow_pte(&sp->spt[i], nonpresent); + __set_spte(&sp->spt[i], nonpresent); continue; } nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gfn, + is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false); } @@ -603,9 +609,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_BASE_ADDR_MASK #undef PT_INDEX #undef PT_LEVEL_MASK -#undef PT_DIR_BASE_ADDR_MASK +#undef PT_LVL_ADDR_MASK +#undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn -#undef gpte_to_gfn_pde +#undef gpte_to_gfn_lvl #undef CMPXCHG diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b1f658ad2f0..944cc9c04b3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -15,7 +15,6 @@ */ #include <linux/kvm_host.h> -#include "kvm_svm.h" #include "irq.h" #include "mmu.h" #include "kvm_cache_regs.h" @@ -26,10 +25,12 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/ftrace_event.h> #include <asm/desc.h> #include <asm/virtext.h> +#include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -46,6 +47,10 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) /* Turn on to get debugging output*/ @@ -57,6 +62,58 @@ MODULE_LICENSE("GPL"); #define nsvm_printk(fmt, args...) do {} while(0) #endif +static const u32 host_save_user_msrs[] = { +#ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, + MSR_FS_BASE, +#endif + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, +}; + +#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs) + +struct kvm_vcpu; + +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; + + /* cache for intercepts of the guest */ + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + +}; + +struct vcpu_svm { + struct kvm_vcpu vcpu; + struct vmcb *vmcb; + unsigned long vmcb_pa; + struct svm_cpu_data *svm_data; + uint64_t asid_generation; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + + u64 next_rip; + + u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; + u64 host_gs_base; + + u32 *msrpm; + + struct nested_state nested; +}; + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -67,15 +124,14 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static int nested = 0; +static int nested = 1; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_complete_interrupts(struct vcpu_svm *svm); -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_exit_handled(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -86,7 +142,22 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) static inline bool is_nested(struct vcpu_svm *svm) { - return svm->nested_vmcb; + return svm->nested.vmcb; +} + +static inline void enable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags |= HF_GIF_MASK; +} + +static inline void disable_gif(struct vcpu_svm *svm) +{ + svm->vcpu.arch.hflags &= ~HF_GIF_MASK; +} + +static inline bool gif_set(struct vcpu_svm *svm) +{ + return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); } static unsigned long iopm_base; @@ -147,19 +218,6 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } -static inline unsigned long kvm_read_cr2(void) -{ - unsigned long cr2; - - asm volatile ("mov %%cr2, %0" : "=r" (cr2)); - return cr2; -} - -static inline void kvm_write_cr2(unsigned long val) -{ - asm volatile ("mov %0, %%cr2" :: "r" (val)); -} - static inline void force_new_asid(struct kvm_vcpu *vcpu) { to_svm(vcpu)->asid_generation--; @@ -263,7 +321,7 @@ static void svm_hardware_enable(void *garbage) struct svm_cpu_data *svm_data; uint64_t efer; - struct desc_ptr gdt_descr; + struct descriptor_table gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -283,8 +341,8 @@ static void svm_hardware_enable(void *garbage) svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; - asm volatile ("sgdt %0" : "=m"(gdt_descr)); - gdt = (struct desc_struct *)gdt_descr.address; + kvm_get_gdt(&gdt_descr); + gdt = (struct desc_struct *)gdt_descr.base; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); rdmsrl(MSR_EFER, efer); @@ -367,8 +425,6 @@ static void svm_vcpu_init_msrpm(u32 *msrpm) #endif set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); } static void svm_enable_lbrv(struct vcpu_svm *svm) @@ -595,8 +651,10 @@ static void init_vmcb(struct vcpu_svm *svm) } force_new_asid(&svm->vcpu); - svm->nested_vmcb = 0; - svm->vcpu.arch.hflags = HF_GIF_MASK; + svm->nested.vmcb = 0; + svm->vcpu.arch.hflags = 0; + + enable_gif(svm); } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -605,7 +663,7 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); - if (vcpu->vcpu_id != 0) { + if (!kvm_vcpu_is_bsp(vcpu)) { kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; @@ -656,9 +714,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) goto uninit; - svm->hsave = page_address(hsave_page); + svm->nested.hsave = page_address(hsave_page); - svm->nested_msrpm = page_address(nested_msrpm_pages); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -669,7 +727,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (svm->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; @@ -688,8 +746,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm->hsave)); - __free_pages(virt_to_page(svm->nested_msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm->nested.hsave)); + __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -740,6 +798,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + switch (reg) { + case VCPU_EXREG_PDPTR: + BUG_ON(!npt_enabled); + load_pdptrs(vcpu, vcpu->arch.cr3); + break; + default: + BUG(); + } +} + static void svm_set_vintr(struct vcpu_svm *svm) { svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; @@ -1061,7 +1131,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) val = 0; } - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -1070,8 +1139,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - *exception = 0; switch (dr) { @@ -1119,25 +1186,9 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - if (!npt_enabled) - KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); - else - KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); - /* - * FIXME: Tis shouldn't be necessary here, but there is a flush - * missing in the MMU code. Until we find this bug, flush the - * complete TLB here on an NPF - */ - if (npt_enabled) - svm_flush_tlb(&svm->vcpu); - else { - if (kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - } + trace_kvm_page_fault(fault_address, error_code); + if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } @@ -1253,14 +1304,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(NMI, &svm->vcpu, handler); return 1; } static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm->vcpu.stat.irq_exits; - KVMTRACE_0D(INTR, &svm->vcpu, handler); return 1; } @@ -1303,44 +1352,39 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { - if (is_nested(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> EXCP 0x%x\n", nr); - - nested_svm_vmexit(svm); - return 1; - } - } + if (!is_nested(svm)) + return 0; - return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = error_code; + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + + return nested_svm_exit_handled(svm); } static inline int nested_svm_intr(struct vcpu_svm *svm) { - if (is_nested(svm)) { - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return 0; + if (!is_nested(svm)) + return 0; - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return 0; + if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return 0; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; + if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) + return 0; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk("VMexit -> INTR\n"); - nested_svm_vmexit(svm); - return 1; - } + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm)) { + nsvm_printk("VMexit -> INTR\n"); + return 1; } return 0; } -static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) { struct page *page; @@ -1348,236 +1392,246 @@ static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); up_read(¤t->mm->mmap_sem); - if (is_error_page(page)) { - printk(KERN_INFO "%s: could not find page at 0x%llx\n", - __func__, gpa); - kvm_release_page_clean(page); - kvm_inject_gp(&svm->vcpu, 0); - return NULL; - } - return page; + if (is_error_page(page)) + goto error; + + return kmap_atomic(page, idx); + +error: + kvm_release_page_clean(page); + kvm_inject_gp(&svm->vcpu, 0); + + return NULL; } -static int nested_svm_do(struct vcpu_svm *svm, - u64 arg1_gpa, u64 arg2_gpa, void *opaque, - int (*handler)(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque)) +static void nested_svm_unmap(void *addr, enum km_type idx) { - struct page *arg1_page; - struct page *arg2_page = NULL; - void *arg1; - void *arg2 = NULL; - int retval; + struct page *page; - arg1_page = nested_svm_get_page(svm, arg1_gpa); - if(arg1_page == NULL) - return 1; + if (!addr) + return; - if (arg2_gpa) { - arg2_page = nested_svm_get_page(svm, arg2_gpa); - if(arg2_page == NULL) { - kvm_release_page_clean(arg1_page); - return 1; - } - } + page = kmap_atomic_to_page(addr); + + kunmap_atomic(addr, idx); + kvm_release_page_dirty(page); +} + +static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) +{ + u32 param = svm->vmcb->control.exit_info_1 & 1; + u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + bool ret = false; + u32 t0, t1; + u8 *msrpm; - arg1 = kmap_atomic(arg1_page, KM_USER0); - if (arg2_gpa) - arg2 = kmap_atomic(arg2_page, KM_USER1); + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return false; - retval = handler(svm, arg1, arg2, opaque); + msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + + if (!msrpm) + goto out; + + switch (msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc0000000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc0000000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc0010000 ... 0xc0011fff: + t0 = (16384 + msr - 0xc0010000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + ret = true; + goto out; + } - kunmap_atomic(arg1, KM_USER0); - if (arg2_gpa) - kunmap_atomic(arg2, KM_USER1); + ret = msrpm[t1] & ((1 << param) << t0); - kvm_release_page_dirty(arg1_page); - if (arg2_gpa) - kvm_release_page_dirty(arg2_page); +out: + nested_svm_unmap(msrpm, KM_USER0); - return retval; + return ret; } -static int nested_svm_exit_handled_real(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque) +static int nested_svm_exit_special(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - bool kvm_overrides = *(bool *)opaque; u32 exit_code = svm->vmcb->control.exit_code; - if (kvm_overrides) { - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - return 0; + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return NESTED_EXIT_HOST; /* For now we are always handling NPFs when using them */ - case SVM_EXIT_NPF: - if (npt_enabled) - return 0; - break; - /* When we're shadowing, trap PFs */ - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - if (!npt_enabled) - return 0; - break; - default: - break; - } + case SVM_EXIT_NPF: + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return NESTED_EXIT_HOST; + break; + default: + break; } + return NESTED_EXIT_CONTINUE; +} + +/* + * If this function returns true, this #vmexit was already handled + */ +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + u32 exit_code = svm->vmcb->control.exit_code; + int vmexit = NESTED_EXIT_HOST; + switch (exit_code) { + case SVM_EXIT_MSR: + vmexit = nested_svm_exit_handled_msr(svm); + break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); - if (nested_vmcb->control.intercept_cr_read & cr_bits) - return 1; + if (svm->nested.intercept_cr_read & cr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0); - if (nested_vmcb->control.intercept_cr_write & cr_bits) - return 1; + if (svm->nested.intercept_cr_write & cr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0); - if (nested_vmcb->control.intercept_dr_read & dr_bits) - return 1; + if (svm->nested.intercept_dr_read & dr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); - if (nested_vmcb->control.intercept_dr_write & dr_bits) - return 1; + if (svm->nested.intercept_dr_write & dr_bits) + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (nested_vmcb->control.intercept_exceptions & excp_bits) - return 1; + if (svm->nested.intercept_exceptions & excp_bits) + vmexit = NESTED_EXIT_DONE; break; } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); nsvm_printk("exit code: 0x%x\n", exit_code); - if (nested_vmcb->control.intercept & exit_bits) - return 1; + if (svm->nested.intercept & exit_bits) + vmexit = NESTED_EXIT_DONE; } } - return 0; -} - -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) -{ - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; - u32 t0, t1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u32 param = svm->vmcb->control.exit_info_1 & 1; - - if (!(nested_vmcb->control.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return 0; - - switch(msr) { - case 0 ... 0x1fff: - t0 = (msr * 2) % 8; - t1 = msr / 8; - break; - case 0xc0000000 ... 0xc0001fff: - t0 = (8192 + msr - 0xc0000000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - case 0xc0010000 ... 0xc0011fff: - t0 = (16384 + msr - 0xc0010000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - default: - return 1; - break; + if (vmexit == NESTED_EXIT_DONE) { + nsvm_printk("#VMEXIT reason=%04x\n", exit_code); + nested_svm_vmexit(svm); } - if (msrpm[t1] & ((1 << param) << t0)) - return 1; - return 0; + return vmexit; +} + +static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +{ + struct vmcb_control_area *dst = &dst_vmcb->control; + struct vmcb_control_area *from = &from_vmcb->control; + + dst->intercept_cr_read = from->intercept_cr_read; + dst->intercept_cr_write = from->intercept_cr_write; + dst->intercept_dr_read = from->intercept_dr_read; + dst->intercept_dr_write = from->intercept_dr_write; + dst->intercept_exceptions = from->intercept_exceptions; + dst->intercept = from->intercept; + dst->iopm_base_pa = from->iopm_base_pa; + dst->msrpm_base_pa = from->msrpm_base_pa; + dst->tsc_offset = from->tsc_offset; + dst->asid = from->asid; + dst->tlb_ctl = from->tlb_ctl; + dst->int_ctl = from->int_ctl; + dst->int_vector = from->int_vector; + dst->int_state = from->int_state; + dst->exit_code = from->exit_code; + dst->exit_code_hi = from->exit_code_hi; + dst->exit_info_1 = from->exit_info_1; + dst->exit_info_2 = from->exit_info_2; + dst->exit_int_info = from->exit_int_info; + dst->exit_int_info_err = from->exit_int_info_err; + dst->nested_ctl = from->nested_ctl; + dst->event_inj = from->event_inj; + dst->event_inj_err = from->event_inj_err; + dst->nested_cr3 = from->nested_cr3; + dst->lbr_ctl = from->lbr_ctl; } -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +static int nested_svm_vmexit(struct vcpu_svm *svm) { - bool k = kvm_override; - - switch (svm->vmcb->control.exit_code) { - case SVM_EXIT_MSR: - return nested_svm_do(svm, svm->nested_vmcb, - svm->nested_vmcb_msrpm, NULL, - nested_svm_exit_handled_msr); - default: break; - } + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; - return nested_svm_do(svm, svm->nested_vmcb, 0, &k, - nested_svm_exit_handled_real); -} - -static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) -{ - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm->hsave; - u64 nested_save[] = { nested_vmcb->save.cr0, - nested_vmcb->save.cr3, - nested_vmcb->save.cr4, - nested_vmcb->save.efer, - nested_vmcb->control.intercept_cr_read, - nested_vmcb->control.intercept_cr_write, - nested_vmcb->control.intercept_dr_read, - nested_vmcb->control.intercept_dr_write, - nested_vmcb->control.intercept_exceptions, - nested_vmcb->control.intercept, - nested_vmcb->control.msrpm_base_pa, - nested_vmcb->control.iopm_base_pa, - nested_vmcb->control.tsc_offset }; + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); + if (!nested_vmcb) + return 1; /* Give the current vmcb to the guest */ - memcpy(nested_vmcb, svm->vmcb, sizeof(struct vmcb)); - nested_vmcb->save.cr0 = nested_save[0]; - if (!npt_enabled) - nested_vmcb->save.cr3 = nested_save[1]; - nested_vmcb->save.cr4 = nested_save[2]; - nested_vmcb->save.efer = nested_save[3]; - nested_vmcb->control.intercept_cr_read = nested_save[4]; - nested_vmcb->control.intercept_cr_write = nested_save[5]; - nested_vmcb->control.intercept_dr_read = nested_save[6]; - nested_vmcb->control.intercept_dr_write = nested_save[7]; - nested_vmcb->control.intercept_exceptions = nested_save[8]; - nested_vmcb->control.intercept = nested_save[9]; - nested_vmcb->control.msrpm_base_pa = nested_save[10]; - nested_vmcb->control.iopm_base_pa = nested_save[11]; - nested_vmcb->control.tsc_offset = nested_save[12]; + disable_gif(svm); + + nested_vmcb->save.es = vmcb->save.es; + nested_vmcb->save.cs = vmcb->save.cs; + nested_vmcb->save.ss = vmcb->save.ss; + nested_vmcb->save.ds = vmcb->save.ds; + nested_vmcb->save.gdtr = vmcb->save.gdtr; + nested_vmcb->save.idtr = vmcb->save.idtr; + if (npt_enabled) + nested_vmcb->save.cr3 = vmcb->save.cr3; + nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rip = vmcb->save.rip; + nested_vmcb->save.rsp = vmcb->save.rsp; + nested_vmcb->save.rax = vmcb->save.rax; + nested_vmcb->save.dr7 = vmcb->save.dr7; + nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.cpl = vmcb->save.cpl; + + nested_vmcb->control.int_ctl = vmcb->control.int_ctl; + nested_vmcb->control.int_vector = vmcb->control.int_vector; + nested_vmcb->control.int_state = vmcb->control.int_state; + nested_vmcb->control.exit_code = vmcb->control.exit_code; + nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi; + nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1; + nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; + nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; + nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.tlb_ctl = 0; + nested_vmcb->control.event_inj = 0; + nested_vmcb->control.event_inj_err = 0; /* We always set V_INTR_MASKING and remember the old value in hflags */ if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK; - if ((nested_vmcb->control.int_ctl & V_IRQ_MASK) && - (nested_vmcb->control.int_vector)) { - nsvm_printk("WARNING: IRQ 0x%x still enabled on #VMEXIT\n", - nested_vmcb->control.int_vector); - } - /* Restore the original control entries */ - svm->vmcb->control = hsave->control; + copy_vmcb_control_area(vmcb, hsave); /* Kill any pending exceptions */ if (svm->vcpu.arch.exception.pending == true) nsvm_printk("WARNING: Pending Exception\n"); - svm->vcpu.arch.exception.pending = false; + + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; @@ -1603,19 +1657,10 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; /* Exit nested SVM mode */ - svm->nested_vmcb = 0; + svm->nested.vmcb = 0; - return 0; -} - -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - nsvm_printk("VMexit\n"); - if (nested_svm_do(svm, svm->nested_vmcb, 0, - NULL, nested_svm_vmexit_real)) - return 1; + nested_svm_unmap(nested_vmcb, KM_USER0); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -1623,38 +1668,63 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } -static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + u32 *nested_msrpm; int i; - u32 *nested_msrpm = (u32*)arg1; + + nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + if (!nested_msrpm) + return false; + for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) - svm->nested_msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested_msrpm); + svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; - return 0; + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + + nested_svm_unmap(nested_msrpm, KM_USER0); + + return true; } -static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm->hsave; + struct vmcb *nested_vmcb; + struct vmcb *hsave = svm->nested.hsave; + struct vmcb *vmcb = svm->vmcb; + + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return false; /* nested_vmcb is our indicator if nested SVM is activated */ - svm->nested_vmcb = svm->vmcb->save.rax; + svm->nested.vmcb = svm->vmcb->save.rax; /* Clear internal status */ - svm->vcpu.arch.exception.pending = false; + kvm_clear_exception_queue(&svm->vcpu); + kvm_clear_interrupt_queue(&svm->vcpu); /* Save the old vmcb, so we don't need to pick what we save, but can restore everything when a VMEXIT occurs */ - memcpy(hsave, svm->vmcb, sizeof(struct vmcb)); - /* We need to remember the original CR3 in the SPT case */ - if (!npt_enabled) - hsave->save.cr3 = svm->vcpu.arch.cr3; - hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rip = svm->next_rip; + hsave->save.es = vmcb->save.es; + hsave->save.cs = vmcb->save.cs; + hsave->save.ss = vmcb->save.ss; + hsave->save.ds = vmcb->save.ds; + hsave->save.gdtr = vmcb->save.gdtr; + hsave->save.idtr = vmcb->save.idtr; + hsave->save.efer = svm->vcpu.arch.shadow_efer; + hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.cr4 = svm->vcpu.arch.cr4; + hsave->save.rflags = vmcb->save.rflags; + hsave->save.rip = svm->next_rip; + hsave->save.rsp = vmcb->save.rsp; + hsave->save.rax = vmcb->save.rax; + if (npt_enabled) + hsave->save.cr3 = vmcb->save.cr3; + else + hsave->save.cr3 = svm->vcpu.arch.cr3; + + copy_vmcb_control_area(hsave, vmcb); if (svm->vmcb->save.rflags & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; @@ -1679,7 +1749,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); kvm_mmu_reset_context(&svm->vcpu); } - svm->vmcb->save.cr2 = nested_vmcb->save.cr2; + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); @@ -1706,7 +1776,15 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.intercept |= nested_vmcb->control.intercept; - svm->nested_vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + + /* cache intercepts */ + svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; + svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; + svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read; + svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write; + svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; + svm->nested.intercept = nested_vmcb->control.intercept; force_new_asid(&svm->vcpu); svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; @@ -1734,12 +1812,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - svm->vcpu.arch.hflags |= HF_GIF_MASK; + nested_svm_unmap(nested_vmcb, KM_USER0); - return 0; + enable_gif(svm); + + return true; } -static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1753,44 +1833,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs; to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp; to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; - - return 1; -} - -static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm->vmcb); -} - -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, - void *arg2, void *opaque) -{ - return nested_svm_vmloadsave(svm->vmcb, (struct vmcb *)nested_vmcb); } static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmload); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_svm_do(svm, svm->vmcb->save.rax, 0, NULL, nested_svm_vmsave); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } @@ -1798,19 +1878,29 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { nsvm_printk("VMrun\n"); + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - if (nested_svm_do(svm, svm->vmcb->save.rax, 0, - NULL, nested_svm_vmrun)) + if (!nested_svm_vmrun(svm)) return 1; - if (nested_svm_do(svm, svm->nested_vmcb_msrpm, 0, - NULL, nested_svm_vmrun_msrpm)) - return 1; + if (!nested_svm_vmrun_msrpm(svm)) + goto failed; + + return 1; + +failed: + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); return 1; } @@ -1823,7 +1913,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - svm->vcpu.arch.hflags |= HF_GIF_MASK; + enable_gif(svm); return 1; } @@ -1836,7 +1926,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - svm->vcpu.arch.hflags &= ~HF_GIF_MASK; + disable_gif(svm); /* After a CLGI no interrupts should come */ svm_clear_vintr(svm); @@ -1845,6 +1935,19 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + nsvm_printk("INVLPGA\n"); + + /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ + kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); + + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + return 1; +} + static int invalid_op_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1953,7 +2056,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { + case MSR_IA32_TSC: { u64 tsc; rdtscll(tsc); @@ -1981,10 +2084,10 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->vmcb->save.sysenter_eip; + *data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->vmcb->save.sysenter_esp; + *data = svm->sysenter_esp; break; /* Nobody will change the following 5 values in the VMCB so we can safely return them on rdmsr. They will always be 0 @@ -2005,7 +2108,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->hsave_msr; + *data = svm->nested.hsave_msr; break; case MSR_VM_CR: *data = 0; @@ -2027,8 +2130,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (svm_get_msr(&svm->vcpu, ecx, &data)) kvm_inject_gp(&svm->vcpu, 0); else { - KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, - (u32)(data >> 32), handler); + trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; @@ -2043,7 +2145,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TIME_STAMP_COUNTER: { + case MSR_IA32_TSC: { u64 tsc; rdtscll(tsc); @@ -2071,9 +2173,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_cs = data; break; case MSR_IA32_SYSENTER_EIP: + svm->sysenter_eip = data; svm->vmcb->save.sysenter_eip = data; break; case MSR_IA32_SYSENTER_ESP: + svm->sysenter_esp = data; svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: @@ -2091,24 +2195,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) else svm_disable_lbrv(svm); break; - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - /* - * Just discard all writes to the performance counters; this - * should keep both older linux and windows 64-bit guests - * happy - */ - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); - - break; case MSR_VM_HSAVE_PA: - svm->hsave_msr = data; + svm->nested.hsave_msr = data; + break; + case MSR_VM_CR: + case MSR_VM_IGNNE: + pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: return kvm_set_msr_common(vcpu, ecx, data); @@ -2122,8 +2214,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) @@ -2144,8 +2235,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2201,7 +2290,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, - [SVM_EXIT_INVLPGA] = invalid_op_interception, + [SVM_EXIT_INVLPGA] = invlpga_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, @@ -2224,20 +2313,26 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; - KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, - (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + trace_kvm_exit(exit_code, svm->vmcb->save.rip); if (is_nested(svm)) { + int vmexit; + nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", exit_code, svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); - if (nested_svm_exit_handled(svm, true)) { - nested_svm_vmexit(svm); - nsvm_printk("-> #VMEXIT\n"); + + vmexit = nested_svm_exit_special(svm); + + if (vmexit == NESTED_EXIT_CONTINUE) + vmexit = nested_svm_exit_handled(svm); + + if (vmexit == NESTED_EXIT_DONE) return 1; - } } + svm_complete_interrupts(svm); + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -2246,12 +2341,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } vcpu->arch.cr0 = svm->vmcb->save.cr0; vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - } if (mmu_reload) { kvm_mmu_reset_context(vcpu); kvm_mmu_load(vcpu); @@ -2319,7 +2408,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; @@ -2329,21 +2418,14 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); } -static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.event_inj = nr | - SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; -} - static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); + BUG_ON(!(gif_set(svm))); - svm_queue_irq(vcpu, vcpu->arch.interrupt.nr); + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | + SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) @@ -2371,13 +2453,25 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; return (vmcb->save.rflags & X86_EFLAGS_IF) && !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - (svm->vcpu.arch.hflags & HF_GIF_MASK); + gif_set(svm) && + !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); } static void enable_irq_window(struct kvm_vcpu *vcpu) { - svm_set_vintr(to_svm(vcpu)); - svm_inject_irq(to_svm(vcpu), 0x0); + struct vcpu_svm *svm = to_svm(vcpu); + nsvm_printk("Trying to open IRQ window\n"); + + nested_svm_intr(svm); + + /* In case GIF=0 we can't rely on the CPU to tell us when + * GIF becomes 1, because that's a separate STGI/VMRUN intercept. + * The next time we get that intercept, this function will be + * called again though and we'll get the vintr intercept. */ + if (gif_set(svm)) { + svm_set_vintr(svm); + svm_inject_irq(svm, 0x0); + } } static void enable_nmi_window(struct kvm_vcpu *vcpu) @@ -2456,6 +2550,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) case SVM_EXITINTINFO_TYPE_EXEPT: /* In case of software exception do not reinject an exception vector, but re-execute and instruction instead */ + if (is_nested(svm)) + break; if (kvm_exception_is_soft(vector)) break; if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { @@ -2498,9 +2594,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fs_selector = kvm_read_fs(); gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); - svm->host_cr2 = kvm_read_cr2(); - if (!is_nested(svm)) - svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm->vmcb->save.cr3 = vcpu->arch.cr3; @@ -2585,8 +2679,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_write_cr2(svm->host_cr2); - kvm_load_fs(fs_selector); kvm_load_gs(gs_selector); kvm_load_ldt(ldt_selector); @@ -2602,7 +2694,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; - svm_complete_interrupts(svm); + if (npt_enabled) { + vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); + vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); + } } #undef R @@ -2673,6 +2768,64 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static const struct trace_print_flags svm_exit_reasons_str[] = { + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, + { SVM_EXIT_INTR, "interrupt" }, + { SVM_EXIT_NMI, "nmi" }, + { SVM_EXIT_SMI, "smi" }, + { SVM_EXIT_INIT, "init" }, + { SVM_EXIT_VINTR, "vintr" }, + { SVM_EXIT_CPUID, "cpuid" }, + { SVM_EXIT_INVD, "invd" }, + { SVM_EXIT_HLT, "hlt" }, + { SVM_EXIT_INVLPG, "invlpg" }, + { SVM_EXIT_INVLPGA, "invlpga" }, + { SVM_EXIT_IOIO, "io" }, + { SVM_EXIT_MSR, "msr" }, + { SVM_EXIT_TASK_SWITCH, "task_switch" }, + { SVM_EXIT_SHUTDOWN, "shutdown" }, + { SVM_EXIT_VMRUN, "vmrun" }, + { SVM_EXIT_VMMCALL, "hypercall" }, + { SVM_EXIT_VMLOAD, "vmload" }, + { SVM_EXIT_VMSAVE, "vmsave" }, + { SVM_EXIT_STGI, "stgi" }, + { SVM_EXIT_CLGI, "clgi" }, + { SVM_EXIT_SKINIT, "skinit" }, + { SVM_EXIT_WBINVD, "wbinvd" }, + { SVM_EXIT_MONITOR, "monitor" }, + { SVM_EXIT_MWAIT, "mwait" }, + { SVM_EXIT_NPF, "npf" }, + { -1, NULL } +}; + +static bool svm_gb_page_enable(void) +{ + return true; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2710,6 +2863,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, + .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -2733,6 +2887,9 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, + + .exit_reasons_str = svm_exit_reasons_str, + .gb_page_enable = svm_gb_page_enable, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 86dbac072d0..eea40439066 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -9,12 +9,16 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) int restart_timer = 0; wait_queue_head_t *q = &vcpu->wq; - /* FIXME: this code should not know anything about vcpus */ - if (!atomic_inc_and_test(&ktimer->pending)) + /* + * There is a race window between reading and incrementing, but we do + * not care about potentially loosing timer events in the !reinject + * case anyway. + */ + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + /* FIXME: this code should not know anything about vcpus */ set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); - - if (!ktimer->reinject) - atomic_set(&ktimer->pending, 1); + } if (waitqueue_active(q)) wake_up_interruptible(q); @@ -33,7 +37,7 @@ enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) struct kvm_vcpu *vcpu; struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id]; + vcpu = ktimer->vcpu; if (!vcpu) return HRTIMER_NORESTART; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h new file mode 100644 index 00000000000..0d480e77eac --- /dev/null +++ b/arch/x86/kvm/trace.h @@ -0,0 +1,355 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_FILE trace + +/* + * Tracepoint for guest mode entry. + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %u", __entry->vcpu_id) +); + +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3), + TP_ARGS(nr, a0, a1, a2, a3), + + TP_STRUCT__entry( + __field( unsigned long, nr ) + __field( unsigned long, a0 ) + __field( unsigned long, a1 ) + __field( unsigned long, a2 ) + __field( unsigned long, a3 ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3) +); + +/* + * Tracepoint for PIO. + */ +TRACE_EVENT(kvm_pio, + TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, + unsigned int count), + TP_ARGS(rw, port, size, count), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, port ) + __field( unsigned int, size ) + __field( unsigned int, count ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->port = port; + __entry->size = size; + __entry->count = count; + ), + + TP_printk("pio_%s at 0x%x size %d count %d", + __entry->rw ? "write" : "read", + __entry->port, __entry->size, __entry->count) +); + +/* + * Tracepoint for cpuid. + */ +TRACE_EVENT(kvm_cpuid, + TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, + unsigned long rcx, unsigned long rdx), + TP_ARGS(function, rax, rbx, rcx, rdx), + + TP_STRUCT__entry( + __field( unsigned int, function ) + __field( unsigned long, rax ) + __field( unsigned long, rbx ) + __field( unsigned long, rcx ) + __field( unsigned long, rdx ) + ), + + TP_fast_assign( + __entry->function = function; + __entry->rax = rax; + __entry->rbx = rbx; + __entry->rcx = rcx; + __entry->rdx = rdx; + ), + + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", + __entry->function, __entry->rax, + __entry->rbx, __entry->rcx, __entry->rdx) +); + +#define AREG(x) { APIC_##x, "APIC_" #x } + +#define kvm_trace_symbol_apic \ + AREG(ID), AREG(LVR), AREG(TASKPRI), AREG(ARBPRI), AREG(PROCPRI), \ + AREG(EOI), AREG(RRR), AREG(LDR), AREG(DFR), AREG(SPIV), AREG(ISR), \ + AREG(TMR), AREG(IRR), AREG(ESR), AREG(ICR), AREG(ICR2), AREG(LVTT), \ + AREG(LVTTHMR), AREG(LVTPC), AREG(LVT0), AREG(LVT1), AREG(LVTERR), \ + AREG(TMICT), AREG(TMCCT), AREG(TDCR), AREG(SELF_IPI), AREG(EFEAT), \ + AREG(ECTRL) +/* + * Tracepoint for apic access. + */ +TRACE_EVENT(kvm_apic, + TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_ARGS(rw, reg, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, reg ) + __field( unsigned int, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->reg = reg; + __entry->val = val; + ), + + TP_printk("apic_%s %s = 0x%x", + __entry->rw ? "write" : "read", + __print_symbolic(__entry->reg, kvm_trace_symbol_apic), + __entry->val) +); + +#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) +#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) + +/* + * Tracepoint for kvm guest exit: + */ +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), + TP_ARGS(exit_reason, guest_rip), + + TP_STRUCT__entry( + __field( unsigned int, exit_reason ) + __field( unsigned long, guest_rip ) + ), + + TP_fast_assign( + __entry->exit_reason = exit_reason; + __entry->guest_rip = guest_rip; + ), + + TP_printk("reason %s rip 0x%lx", + ftrace_print_symbols_seq(p, __entry->exit_reason, + kvm_x86_ops->exit_reasons_str), + __entry->guest_rip) +); + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_virq, + TP_PROTO(unsigned int irq), + TP_ARGS(irq), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + ), + + TP_fast_assign( + __entry->irq = irq; + ), + + TP_printk("irq %u", __entry->irq) +); + +/* + * Tracepoint for page fault. + */ +TRACE_EVENT(kvm_page_fault, + TP_PROTO(unsigned long fault_address, unsigned int error_code), + TP_ARGS(fault_address, error_code), + + TP_STRUCT__entry( + __field( unsigned long, fault_address ) + __field( unsigned int, error_code ) + ), + + TP_fast_assign( + __entry->fault_address = fault_address; + __entry->error_code = error_code; + ), + + TP_printk("address %lx error_code %x", + __entry->fault_address, __entry->error_code) +); + +/* + * Tracepoint for guest MSR access. + */ +TRACE_EVENT(kvm_msr, + TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), + TP_ARGS(rw, ecx, data), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, ecx ) + __field( unsigned long, data ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->ecx = ecx; + __entry->data = data; + ), + + TP_printk("msr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->ecx, __entry->data) +); + +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) + +/* + * Tracepoint for guest CR access. + */ +TRACE_EVENT(kvm_cr, + TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), + TP_ARGS(rw, cr, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, cr ) + __field( unsigned long, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->cr = cr; + __entry->val = val; + ), + + TP_printk("cr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->cr, __entry->val) +); + +#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) +#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) + +TRACE_EVENT(kvm_pic_set_irq, + TP_PROTO(__u8 chip, __u8 pin, __u8 elcr, __u8 imr, bool coalesced), + TP_ARGS(chip, pin, elcr, imr, coalesced), + + TP_STRUCT__entry( + __field( __u8, chip ) + __field( __u8, pin ) + __field( __u8, elcr ) + __field( __u8, imr ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->chip = chip; + __entry->pin = pin; + __entry->elcr = elcr; + __entry->imr = imr; + __entry->coalesced = coalesced; + ), + + TP_printk("chip %u pin %u (%s%s)%s", + __entry->chip, __entry->pin, + (__entry->elcr & (1 << __entry->pin)) ? "level":"edge", + (__entry->imr & (1 << __entry->pin)) ? "|masked":"", + __entry->coalesced ? " (coalesced)" : "") +); + +#define kvm_apic_dst_shorthand \ + {0x0, "dst"}, \ + {0x1, "self"}, \ + {0x2, "all"}, \ + {0x3, "all-but-self"} + +TRACE_EVENT(kvm_apic_ipi, + TP_PROTO(__u32 icr_low, __u32 dest_id), + TP_ARGS(icr_low, dest_id), + + TP_STRUCT__entry( + __field( __u32, icr_low ) + __field( __u32, dest_id ) + ), + + TP_fast_assign( + __entry->icr_low = icr_low; + __entry->dest_id = dest_id; + ), + + TP_printk("dst %x vec %u (%s|%s|%s|%s|%s)", + __entry->dest_id, (u8)__entry->icr_low, + __print_symbolic((__entry->icr_low >> 8 & 0x7), + kvm_deliver_mode), + (__entry->icr_low & (1<<11)) ? "logical" : "physical", + (__entry->icr_low & (1<<14)) ? "assert" : "de-assert", + (__entry->icr_low & (1<<15)) ? "level" : "edge", + __print_symbolic((__entry->icr_low >> 18 & 0x3), + kvm_apic_dst_shorthand)) +); + +TRACE_EVENT(kvm_apic_accept_irq, + TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced), + TP_ARGS(apicid, dm, tm, vec, coalesced), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( __u16, dm ) + __field( __u8, tm ) + __field( __u8, vec ) + __field( bool, coalesced ) + ), + + TP_fast_assign( + __entry->apicid = apicid; + __entry->dm = dm; + __entry->tm = tm; + __entry->vec = vec; + __entry->coalesced = coalesced; + ), + + TP_printk("apicid %x vec %u (%s|%s)%s", + __entry->apicid, __entry->vec, + __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode), + __entry->tm ? "level" : "edge", + __entry->coalesced ? " (coalesced)" : "") +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f912927a5..f3812014bd0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -25,6 +25,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/moduleparam.h> +#include <linux/ftrace_event.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -34,6 +35,8 @@ #include <asm/virtext.h> #include <asm/mce.h> +#include "trace.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -51,6 +54,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); static int __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); +static int __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); @@ -84,6 +91,14 @@ struct vcpu_vmx { int guest_efer_loaded; } host_state; struct { + int vm86_active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; struct { bool pending; u8 vector; @@ -161,6 +176,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static void ept_save_pdptrs(struct kvm_vcpu *vcpu); + /* * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. @@ -256,6 +273,26 @@ static inline bool cpu_has_vmx_flexpriority(void) cpu_has_vmx_virtualize_apic_accesses(); } +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); +} + +static inline bool cpu_has_vmx_eptp_uncacheable(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); +} + +static inline bool cpu_has_vmx_eptp_writeback(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -277,6 +314,12 @@ static inline int cpu_has_vmx_ept(void) SECONDARY_EXEC_ENABLE_EPT; } +static inline int cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -497,14 +540,16 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; + /* + * Unconditionally intercept #DB so we can maintain dr6 without + * reading it every exit. + */ + eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - eb |= 1u << DB_VECTOR; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ @@ -528,12 +573,15 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { int efer_offset = vmx->msr_offset_efer; - u64 host_efer = vmx->host_msrs[efer_offset].data; - u64 guest_efer = vmx->guest_msrs[efer_offset].data; + u64 host_efer; + u64 guest_efer; u64 ignore_bits; if (efer_offset < 0) return; + host_efer = vmx->host_msrs[efer_offset].data; + guest_efer = vmx->guest_msrs[efer_offset].data; + /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless * outside long mode @@ -735,12 +783,17 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - return vmcs_readl(GUEST_RFLAGS); + unsigned long rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) + rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -797,12 +850,13 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (nr == BP_VECTOR || nr == OF_VECTOR) - vmx->rmode.irq.rip++; + if (kvm_exception_is_soft(nr)) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; intr_info |= INTR_TYPE_SOFT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); @@ -940,7 +994,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); #endif - case MSR_IA32_TIME_STAMP_COUNTER: + case MSR_IA32_TSC: data = guest_read_tsc(); break; case MSR_IA32_SYSENTER_CS: @@ -953,9 +1007,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: - vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { + vmx_load_host_state(to_vmx(vcpu)); data = msr->data; break; } @@ -1000,22 +1054,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_SYSENTER_ESP: vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_TIME_STAMP_COUNTER: + case MSR_IA32_TSC: rdtscll(host_tsc); guest_write_tsc(data, host_tsc); break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - /* - * Just discard all writes to the performance counters; this - * should keep both older linux and windows 64-bit guests - * happy - */ - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); - - break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, data); @@ -1024,9 +1066,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } /* Otherwise falls through to kvm_set_msr_common */ default: - vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); if (msr) { + vmx_load_host_state(vmx); msr->data = data; break; } @@ -1046,6 +1088,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) case VCPU_REGS_RIP: vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; default: break; } @@ -1203,7 +1249,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT; + SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1217,12 +1264,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ - min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) - return -EIO; + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, vmx_capability.ept, vmx_capability.vpid); } @@ -1333,8 +1377,13 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) + if (!cpu_has_vmx_ept()) { enable_ept = 0; + enable_unrestricted_guest = 0; + } + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; @@ -1342,6 +1391,9 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_tpr_shadow()) kvm_x86_ops->update_cr8_intercept = NULL; + if (enable_ept && !cpu_has_vmx_ept_2m_page()) + kvm_disable_largepages(); + return alloc_kvm_area(); } @@ -1372,15 +1424,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1391,10 +1443,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) return; - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1433,20 +1485,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_unrestricted_guest) + return; + vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 1; + vmx->rmode.vm86_active = 1; - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl + vmx->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1468,10 +1523,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -1545,11 +1600,11 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty)) + return; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_ERR "EPT: Fail to load pdptrs!\n"); - return; - } vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); @@ -1557,6 +1612,21 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) } } +static void ept_save_pdptrs(struct kvm_vcpu *vcpu) +{ + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + } + + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); +} + static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -1571,8 +1641,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; - *hw_cr0 &= ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1581,9 +1649,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - if (!(vcpu->arch.cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; } + + if (!(cr0 & X86_CR0_WP)) + *hw_cr0 &= ~X86_CR0_WP; } static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, @@ -1598,15 +1667,21 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | - KVM_VM_CR0_ALWAYS_ON; + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr0; + + if (enable_unrestricted_guest) + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) + | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1650,10 +1725,8 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); - ept_sync_context(eptp); - ept_load_pdptrs(vcpu); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : - VMX_EPT_IDENTITY_PAGETABLE_ADDR; + vcpu->kvm->arch.ept_identity_map_addr; } vmx_flush_tlb(vcpu); @@ -1664,7 +1737,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? + unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; @@ -1707,16 +1780,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ return 3; - vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); - return kvm_seg.selector & 3; + return vmcs_read16(GUEST_CS_SELECTOR) & 3; } static u32 vmx_segment_access_rights(struct kvm_segment *var) @@ -1744,20 +1814,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.vm86_active && var->s) { + if (vmx->rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -1766,6 +1837,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar = 0xf3; } else ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ + vmcs_write32(sf->ar_bytes, ar); } @@ -2040,7 +2126,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (likely(kvm->arch.ept_identity_pagetable_done)) return 1; ret = 0; - identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2062,11 +2148,19 @@ out: static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); + if (enable_unrestricted_guest) { + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + } else + ar = 0xf3; + + vmcs_write32(sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) @@ -2101,14 +2195,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: up_write(&kvm->slots_lock); return r; @@ -2209,6 +2304,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -2326,14 +2423,14 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; - if (vmx->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&vmx->vcpu)) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); @@ -2344,7 +2441,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. */ - if (vmx->vcpu.vcpu_id == 0) { + if (kvm_vcpu_is_bsp(&vmx->vcpu)) { vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0x000f0000); } else { @@ -2373,7 +2470,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_SYSENTER_EIP, 0); vmcs_writel(GUEST_RFLAGS, 0x02); - if (vmx->vcpu.vcpu_id == 0) + if (kvm_vcpu_is_bsp(&vmx->vcpu)) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); @@ -2461,13 +2558,16 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (vcpu->arch.interrupt.soft) + vmx->rmode.irq.rip += + vmx->vcpu.arch.event_exit_inst_len; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); @@ -2502,7 +2602,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2659,14 +2759,14 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); - KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, - (u32)((u64)cr2 >> 32), handler); + trace_kvm_page_fault(cr2, error_code); + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.vm86_active && + if (vmx->rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { @@ -2707,7 +2807,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { ++vcpu->stat.irq_exits; - KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } @@ -2755,7 +2854,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - unsigned long exit_qualification; + unsigned long exit_qualification, val; int cr; int reg; @@ -2764,21 +2863,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr0(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 3: - kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr3(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr4(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 8: { @@ -2800,23 +2897,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); - KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: kvm_register_write(vcpu, reg, vcpu->arch.cr3); - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + trace_kvm_cr_read(cr, vcpu->arch.cr3); skip_emulated_instruction(vcpu); return 1; case 8: - kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); - KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), handler); + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } @@ -2841,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; + if (!kvm_require_cpl(vcpu, 0)) + return 1; dr = vmcs_readl(GUEST_DR7); if (dr & DR7_GD) { /* @@ -2884,7 +2979,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) val = 0; } kvm_register_write(vcpu, reg, val); - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { val = vcpu->arch.regs[reg]; switch (dr) { @@ -2917,7 +3011,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } break; } - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -2939,8 +3032,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_read(ecx, data); /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; @@ -2955,8 +3047,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); @@ -2983,7 +3074,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - KVMTRACE_0D(PEND_INTR, vcpu, handler); ++vcpu->stat.irq_window_exits; /* @@ -3049,7 +3139,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "Fail to handle apic access vmexit! Offset is 0x%lx\n", offset); - return -ENOTSUPP; + return -ENOEXEC; } return 1; } @@ -3118,7 +3208,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (exit_qualification & (1 << 6)) { printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -ENOTSUPP; + return -EINVAL; } gla_validity = (exit_qualification >> 7) & 0x3; @@ -3130,14 +3220,98 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = 0; - return -ENOTSUPP; + kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + return 0; } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } +static u64 ept_rsvd_mask(u64 spte, int level) +{ + int i; + u64 mask = 0; + + for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) + mask |= (1ULL << i); + + if (level > 2) + /* bits 7:3 reserved */ + mask |= 0xf8; + else if (level == 2) { + if (spte & (1ULL << 7)) + /* 2MB ref, bits 20:12 reserved */ + mask |= 0x1ff000; + else + /* bits 6:3 reserved */ + mask |= 0x78; + } + + return mask; +} + +static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, + int level) +{ + printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); + + /* 010b (write-only) */ + WARN_ON((spte & 0x7) == 0x2); + + /* 110b (write/execute) */ + WARN_ON((spte & 0x7) == 0x6); + + /* 100b (execute-only) and value not supported by logical processor */ + if (!cpu_has_vmx_ept_execute_only()) + WARN_ON((spte & 0x7) == 0x4); + + /* not 000b */ + if ((spte & 0x7)) { + u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); + + if (rsvd_bits != 0) { + printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", + __func__, rsvd_bits); + WARN_ON(1); + } + + if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) { + u64 ept_mem_type = (spte & 0x38) >> 3; + + if (ept_mem_type == 2 || ept_mem_type == 3 || + ept_mem_type == 7) { + printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", + __func__, ept_mem_type); + WARN_ON(1); + } + } + } +} + +static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 sptes[4]; + int nr_sptes, i; + gpa_t gpa; + + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + + printk(KERN_ERR "EPT: Misconfiguration.\n"); + printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); + + nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); + + for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) + ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); + + kvm_run->exit_reason = KVM_EXIT_UNKNOWN; + kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + + return 0; +} + static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u32 cpu_based_vm_exec_control; @@ -3217,8 +3391,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, }; static const int kvm_vmx_max_exit_handlers = @@ -3234,8 +3409,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), - (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ @@ -3247,10 +3421,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (enable_ept && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - ept_load_pdptrs(vcpu); - } if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -3326,10 +3498,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, &vmx->vcpu, handler); + (exit_intr_info & INTR_INFO_VALID_MASK)) asm("int $2"); - } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3434,6 +3604,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_ept && is_paging(vcpu)) { + vmcs_writel(GUEST_CR3, vcpu->arch.cr3); + ept_load_pdptrs(vcpu); + } /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) vmx->entry_time = ktime_get(); @@ -3449,12 +3623,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* When single-stepping over STI and MOV SS, we must clear the + * corresponding interruptibility bits in the guest state. Otherwise + * vmentry fails as it then expects bit 14 (BS) in pending debug + * exceptions being set, but that's not correct for the guest debugging + * case. */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vmx_set_interrupt_shadow(vcpu, 0); + /* * Loading guest fpu may have cleared host cr0.ts */ vmcs_writel(HOST_CR0, read_cr0()); - set_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + set_debugreg(vcpu->arch.dr6, 6); asm( /* Store host registers */ @@ -3465,11 +3648,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" + /* Reload cr2 if changed */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%cr2, %%"R"dx \n\t" + "cmp %%"R"ax, %%"R"dx \n\t" + "je 2f \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%"R"ax, %%cr2 \n\t" "mov %c[rax](%0), %%"R"ax \n\t" "mov %c[rbx](%0), %%"R"bx \n\t" "mov %c[rdx](%0), %%"R"dx \n\t" @@ -3547,10 +3735,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; - get_debugreg(vcpu->arch.dr6, 6); + if (vcpu->arch.switch_db_regs) + get_debugreg(vcpu->arch.dr6, 6); vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) @@ -3633,9 +3823,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (enable_ept) + if (enable_ept) { + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = + VMX_EPT_IDENTITY_PAGETABLE_ADDR; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + } return &vmx->vcpu; @@ -3699,6 +3893,34 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +static const struct trace_print_flags vmx_exit_reasons_str[] = { + { EXIT_REASON_EXCEPTION_NMI, "exception" }, + { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, + { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, + { EXIT_REASON_NMI_WINDOW, "nmi_window" }, + { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, + { EXIT_REASON_CR_ACCESS, "cr_access" }, + { EXIT_REASON_DR_ACCESS, "dr_access" }, + { EXIT_REASON_CPUID, "cpuid" }, + { EXIT_REASON_MSR_READ, "rdmsr" }, + { EXIT_REASON_MSR_WRITE, "wrmsr" }, + { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, + { EXIT_REASON_HLT, "halt" }, + { EXIT_REASON_INVLPG, "invlpg" }, + { EXIT_REASON_VMCALL, "hypercall" }, + { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, + { EXIT_REASON_APIC_ACCESS, "apic_access" }, + { EXIT_REASON_WBINVD, "wbinvd" }, + { EXIT_REASON_TASK_SWITCH, "task_switch" }, + { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + { -1, NULL } +}; + +static bool vmx_gb_page_enable(void) +{ + return false; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3758,6 +3980,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, + + .exit_reasons_str = vmx_exit_reasons_str, + .gb_page_enable = vmx_gb_page_enable, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 633ccc7400a..be451ee4424 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -37,11 +37,16 @@ #include <linux/iommu.h> #include <linux/intel-iommu.h> #include <linux/cpufreq.h> +#include <trace/events/kvm.h> +#undef TRACE_INCLUDE_FILE +#define CREATE_TRACE_POINTS +#include "trace.h" #include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> #include <asm/mtrr.h> +#include <asm/mce.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -55,6 +60,10 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + +#define KVM_MAX_MCE_BANKS 32 +#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -68,14 +77,16 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); +int ignore_msrs = 0; +module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -122,18 +133,16 @@ unsigned long segment_base(u16 selector) if (selector == 0) return 0; - asm("sgdt %0" : "=m"(gdt)); + kvm_get_gdt(&gdt); table_base = gdt.base; if (selector & 4) { /* from ldt */ - u16 ldt_selector; + u16 ldt_selector = kvm_read_ldt(); - asm("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } d = (struct desc_struct *)(table_base + (selector & ~7)); - v = d->base0 | ((unsigned long)d->base1 << 16) | - ((unsigned long)d->base2 << 24); + v = get_desc_base(d); #ifdef CONFIG_X86_64 if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; @@ -176,16 +185,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, ++vcpu->stat.pf_guest; if (vcpu->arch.exception.pending) { - if (vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - } else if (vcpu->arch.exception.nr == DF_VECTOR) { + switch(vcpu->arch.exception.nr) { + case DF_VECTOR: /* triple fault -> shutdown */ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + case PF_VECTOR: + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + return; + default: + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + vcpu->arch.exception.pending = false; + break; } - return; } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); @@ -207,12 +222,18 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); -static void __queue_exception(struct kvm_vcpu *vcpu) +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. + */ +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + return true; + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return false; } +EXPORT_SYMBOL_GPL(kvm_require_cpl); /* * Load the pae pdptrs. Return true is they are all valid. @@ -232,7 +253,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_pte(pdpte[i]) && + if (is_present_gpte(pdpte[i]) && (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) { ret = 0; goto out; @@ -241,6 +262,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) ret = 1; memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); out: return ret; @@ -256,6 +281,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return true; + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; @@ -328,9 +357,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); - KVMTRACE_1D(LMSW, vcpu, - (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), - handler); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -466,7 +492,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; @@ -644,8 +670,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, - &vcpu->hv_clock.tsc_timestamp); + kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); local_irq_restore(flags); @@ -778,23 +803,60 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + + switch (msr) { + case MSR_IA32_MCG_STATUS: + vcpu->arch.mcg_status = data; + break; + case MSR_IA32_MCG_CTL: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL */ + if ((offset & 0x3) == 0 && + data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case MSR_EFER: set_efer(vcpu, data); break; - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __func__, data); + case MSR_K7_HWCR: + data &= ~(u64)0x40; /* ignore flush filter disable */ + if (data != 0) { + pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); + return 1; + } break; - case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __func__, data); + case MSR_FAM10H_MMIO_CONF_BASE: + if (data != 0) { + pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); + return 1; + } break; - case MSR_IA32_MCG_CTL: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", - __func__, data); + case MSR_AMD64_NB_CFG: break; case MSR_IA32_DEBUGCTLMSR: if (!data) { @@ -811,12 +873,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: kvm_set_apic_base(vcpu, data); break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -850,9 +915,50 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_request_guest_time_update(vcpu); break; } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); + + /* Performance counters are not protected by a CPUID bit, + * so we should check all of them in the generic path for the sake of + * cross vendor migration. + * Writing a zero into the event select MSRs disables them, + * which we perfectly emulate ;-). Any other value should be at least + * reported, some guests depend on them. + */ + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + if (data != 0) + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + /* at least RHEL 4 unconditionally writes to the perfctr registers, + * so we ignore writes to make it happy. + */ + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; default: - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); - return 1; + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); + return 1; + } else { + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); + break; + } } return 0; } @@ -905,26 +1011,47 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: + data = 0; + break; case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; case MSR_IA32_MCG_CTL: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_MC0_MISC+20: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case MSR_IA32_PLATFORM_ID: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -932,10 +1059,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: + case MSR_K8_SYSCFG: + case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: + case MSR_K8_INT_PENDING_MSG: + case MSR_AMD64_NB_CFG: + case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; case MSR_MTRRcap: @@ -949,6 +1084,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; + case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + return kvm_x2apic_msr_read(vcpu, msr, pdata); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -967,9 +1105,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME: data = vcpu->arch.time; break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); default: - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); - return 1; + if (!ignore_msrs) { + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } else { + pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + data = 0; + } + break; } *pdata = data; return 0; @@ -1068,6 +1219,11 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: + case KVM_CAP_IRQFD: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1088,6 +1244,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; + case KVM_CAP_MCE: + r = KVM_MAX_MCE_BANKS; + break; default: r = 0; break; @@ -1147,6 +1306,16 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } + case KVM_X86_GET_MCE_CAP_SUPPORTED: { + u64 mce_cap; + + mce_cap = KVM_MCE_CAP_SUPPORTED; + r = -EFAULT; + if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + goto out; + r = 0; + break; + } default: r = -EINVAL; } @@ -1227,6 +1396,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); r = 0; + kvm_apic_set_version(vcpu); out_free: vfree(cpuid_entries); @@ -1248,6 +1418,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); return 0; out: @@ -1290,6 +1461,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; + unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 unsigned f_lm = F(LM); #else @@ -1314,7 +1486,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = @@ -1323,7 +1495,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved, XSAVE, OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = @@ -1344,6 +1516,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 1: entry->edx &= kvm_supported_word0_x86_features; entry->ecx &= kvm_supported_word4_x86_features; + /* we support x2apic emulation even if host does not support + * it since we emulate x2apic in software */ + entry->ecx |= F(X2APIC); break; /* function 2 entries are STATEFUL. That is, repeated cpuid commands * may return different values. This forces us to get_cpu() before @@ -1435,6 +1610,10 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + r = -EFAULT; if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) @@ -1464,6 +1643,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); vcpu_put(vcpu); return 0; @@ -1503,6 +1683,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + int r; + unsigned bank_num = mcg_cap & 0xff, bank; + + r = -EINVAL; + if (!bank_num) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + r = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(u64)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(u64)0; +out: + return r; +} + +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, + struct kvm_x86_mce *mce) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u64 *banks = vcpu->arch.mce_banks; + + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) + return -EINVAL; + /* + * if IA32_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && + vcpu->arch.mcg_ctl != ~(u64)0) + return 0; + banks += 4 * mce->bank; + /* + * if IA32_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) + return 0; + if (mce->status & MCI_STATUS_UC) { + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || + !(vcpu->arch.cr4 & X86_CR4_MCE)) { + printk(KERN_DEBUG "kvm: set_mce: " + "injects mce exception while " + "previous one is in progress!\n"); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return 0; + } + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + banks[1] = mce->status; + kvm_queue_exception(vcpu, MC_VECTOR); + } else if (!(banks[1] & MCI_STATUS_VAL) + || !(banks[1] & MCI_STATUS_UC)) { + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + banks[1] = mce->status; + } else + banks[1] |= MCI_STATUS_OVER; + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1636,6 +1890,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } + case KVM_X86_SETUP_MCE: { + u64 mcg_cap; + + r = -EFAULT; + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + goto out; + r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + break; + } + case KVM_X86_SET_MCE: { + struct kvm_x86_mce mce; + + r = -EFAULT; + if (copy_from_user(&mce, argp, sizeof mce)) + goto out; + r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + break; + } default: r = -EINVAL; } @@ -1654,6 +1926,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) return ret; } +static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, + u64 ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return 0; +} + static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { @@ -1775,19 +2054,25 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: + mutex_lock(&kvm->irq_lock); memcpy(ioapic_irqchip(kvm), &chip->chip.ioapic, sizeof(struct kvm_ioapic_state)); + mutex_unlock(&kvm->irq_lock); break; default: r = -EINVAL; @@ -1801,7 +2086,9 @@ static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int r = 0; + mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -1809,8 +2096,39 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int r = 0; + mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count); + kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0, start = 0; + u32 prev_legacy, cur_legacy; + mutex_lock(&kvm->arch.vpit->pit_state.lock); + prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, + sizeof(kvm->arch.vpit->pit_state.channels)); + kvm->arch.vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -1819,7 +2137,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, { if (!kvm->arch.vpit) return -ENXIO; + mutex_lock(&kvm->arch.vpit->pit_state.lock); kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); return 0; } @@ -1845,7 +2165,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); memslot = &kvm->memslots[log->slot]; n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; memset(memslot->dirty_bitmap, 0, n); @@ -1869,7 +2188,9 @@ long kvm_arch_vm_ioctl(struct file *filp, */ union { struct kvm_pit_state ps; + struct kvm_pit_state2 ps2; struct kvm_memory_alias alias; + struct kvm_pit_config pit_config; } u; switch (ioctl) { @@ -1878,6 +2199,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r < 0) goto out; break; + case KVM_SET_IDENTITY_MAP_ADDR: { + u64 ident_addr; + + r = -EFAULT; + if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + goto out; + r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); + if (r < 0) + goto out; + break; + } case KVM_SET_MEMORY_REGION: { struct kvm_memory_region kvm_mem; struct kvm_userspace_memory_region kvm_userspace_mem; @@ -1930,16 +2262,24 @@ long kvm_arch_vm_ioctl(struct file *filp, } break; case KVM_CREATE_PIT: - mutex_lock(&kvm->lock); + u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; + goto create_pit; + case KVM_CREATE_PIT2: + r = -EFAULT; + if (copy_from_user(&u.pit_config, argp, + sizeof(struct kvm_pit_config))) + goto out; + create_pit: + down_write(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; r = -ENOMEM; - kvm->arch.vpit = kvm_create_pit(kvm); + kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); if (kvm->arch.vpit) r = 0; create_pit_unlock: - mutex_unlock(&kvm->lock); + up_write(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -1950,10 +2290,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, @@ -2042,6 +2382,32 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_PIT2: { + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) + goto out; + r = 0; + break; + } + case KVM_SET_PIT2: { + r = -EFAULT; + if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); + if (r) + goto out; + r = 0; + break; + } case KVM_REINJECT_CONTROL: { struct kvm_reinject_control control; r = -EFAULT; @@ -2075,35 +2441,23 @@ static void kvm_init_msr_list(void) num_msrs_to_save = j; } -/* - * Only apic need an MMIO device hook, so shortcut now.. - */ -static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) +static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, + const void *v) { - struct kvm_io_device *dev; + if (vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) + return 0; - if (vcpu->arch.apic) { - dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr, len, is_write)) - return dev; - } - return NULL; + return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); } - -static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) +static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - struct kvm_io_device *dev; + if (vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) + return 0; - dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); - if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, - is_write); - return dev; + return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, @@ -2172,11 +2526,12 @@ static int emulator_read_emulated(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - struct kvm_io_device *mmio_dev; gpa_t gpa; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, + vcpu->mmio_phys_addr, *(u64 *)val); vcpu->mmio_read_completed = 0; return X86EMUL_CONTINUE; } @@ -2197,14 +2552,12 @@ mmio: /* * Is this MMIO handled locally? */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); - if (mmio_dev) { - kvm_iodevice_read(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); + if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); return X86EMUL_CONTINUE; } - mutex_unlock(&vcpu->kvm->lock); + + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2231,7 +2584,6 @@ static int emulator_write_emulated_onepage(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - struct kvm_io_device *mmio_dev; gpa_t gpa; gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); @@ -2249,17 +2601,12 @@ static int emulator_write_emulated_onepage(unsigned long addr, return X86EMUL_CONTINUE; mmio: + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ - mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); - if (mmio_dev) { - kvm_iodevice_write(mmio_dev, gpa, bytes, val); - mutex_unlock(&vcpu->kvm->lock); + if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) return X86EMUL_CONTINUE; - } - mutex_unlock(&vcpu->kvm->lock); vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; @@ -2343,7 +2690,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - KVMTRACE_0D(CLTS, vcpu, handler); kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } @@ -2420,7 +2766,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; /* - * TODO: fix x86_emulate.c to use guest_read/write_register + * TODO: fix emulate.c to use guest_read/write_register * instead of direct ->regs accesses, can save hundred cycles * on Intel for instructions that don't read/change RSP, for * for example. @@ -2444,14 +2790,33 @@ int emulate_instruction(struct kvm_vcpu *vcpu, r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - /* Reject the instructions other than VMCALL/VMMCALL when - * try to emulate invalid opcode */ + /* Only allow emulation of specific instructions on #UD + * (namely VMMCALL, sysenter, sysexit, syscall)*/ c = &vcpu->arch.emulate_ctxt.decode; - if ((emulation_type & EMULTYPE_TRAP_UD) && - (!(c->twobyte && c->b == 0x01 && - (c->modrm_reg == 0 || c->modrm_reg == 3) && - c->modrm_mod == 3 && c->modrm_rm == 1))) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_TRAP_UD) { + if (!c->twobyte) + return EMULATE_FAIL; + switch (c->b) { + case 0x01: /* VMMCALL */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return EMULATE_FAIL; + break; + case 0x34: /* sysenter */ + case 0x35: /* sysexit */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + case 0x05: /* syscall */ + if (c->modrm_mod != 0 || c->modrm_rm != 0) + return EMULATE_FAIL; + break; + default: + return EMULATE_FAIL; + } + + if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) + return EMULATE_FAIL; + } ++vcpu->stat.insn_emulation; if (r) { @@ -2571,52 +2936,40 @@ int complete_pio(struct kvm_vcpu *vcpu) return 0; } -static void kernel_pio(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu, - void *pd) +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) { /* TODO: String I/O for in kernel device */ + int r; - mutex_lock(&vcpu->kvm->lock); if (vcpu->arch.pio.in) - kvm_iodevice_read(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); + r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); else - kvm_iodevice_write(pio_dev, vcpu->arch.pio.port, - vcpu->arch.pio.size, - pd); - mutex_unlock(&vcpu->kvm->lock); + r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + return r; } -static void pio_string_write(struct kvm_io_device *pio_dev, - struct kvm_vcpu *vcpu) +static int pio_string_write(struct kvm_vcpu *vcpu) { struct kvm_pio_request *io = &vcpu->arch.pio; void *pd = vcpu->arch.pio_data; - int i; + int i, r = 0; - mutex_lock(&vcpu->kvm->lock); for (i = 0; i < io->cur_count; i++) { - kvm_iodevice_write(pio_dev, io->port, - io->size, - pd); + if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + io->port, io->size, pd)) { + r = -EOPNOTSUPP; + break; + } pd += io->size; } - mutex_unlock(&vcpu->kvm->lock); -} - -static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr, int len, - int is_write) -{ - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); + return r; } int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { - struct kvm_io_device *pio_dev; unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; @@ -2630,19 +2983,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, 1); val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); - if (pio_dev) { - kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { complete_pio(vcpu); return 1; } @@ -2656,7 +3003,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, { unsigned now, in_page; int ret = 0; - struct kvm_io_device *pio_dev; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2669,12 +3015,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, count); if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2704,9 +3046,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.guest_gva = address; - pio_dev = vcpu_find_pio_dev(vcpu, port, - vcpu->arch.pio.cur_count, - !vcpu->arch.pio.in); if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); @@ -2714,16 +3053,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_inject_gp(vcpu, 0); return 1; } - if (ret == 0 && pio_dev) { - pio_string_write(pio_dev, vcpu); + if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) ret = 1; } - } else if (pio_dev) - pr_unimpl(vcpu, "no string pio read support yet, " - "port %x size %d count %ld\n", - port, size, count); + } + /* no string PIO read support yet */ return ret; } @@ -2756,10 +3092,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = kvm->vcpus[i]; - if (!vcpu) - continue; + kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; if (!kvm_request_guest_time_update(vcpu)) @@ -2852,7 +3185,6 @@ void kvm_arch_exit(void) int kvm_emulate_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; @@ -2883,7 +3215,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); + trace_kvm_hypercall(nr, a0, a1, a2, a3); if (!is_long_mode(vcpu)) { nr &= 0xFFFFFFFF; @@ -2893,6 +3225,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } + if (kvm_x86_ops->get_cpl(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; @@ -2904,6 +3241,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } +out: kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; @@ -2983,8 +3321,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, - (u32)((u64)value >> 32), handler); return value; } @@ -2992,9 +3328,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, unsigned long *rflags) { - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, - (u32)((u64)val >> 32), handler); - switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); @@ -3104,11 +3437,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } kvm_x86_ops->skip_emulated_instruction(vcpu); - KVMTRACE_5D(CPUID, vcpu, function, - (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -3174,6 +3507,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; + if (!vcpu->arch.apic) + return; + if (!vcpu->arch.apic->vapic_addr) max_irr = kvm_lapic_find_highest_irr(vcpu); else @@ -3187,12 +3523,16 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - kvm_x86_ops->set_interrupt_shadow(vcpu, 0); - /* try to reinject previous events if any */ + if (vcpu->arch.exception.pending) { + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); + return; + } + if (vcpu->arch.nmi_injected) { kvm_x86_ops->set_nmi(vcpu); return; @@ -3266,16 +3606,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) smp_mb__after_clear_bit(); if (vcpu->requests || need_resched() || signal_pending(current)) { + set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (vcpu->arch.exception.pending) - __queue_exception(vcpu); - else - inject_pending_irq(vcpu, kvm_run); + inject_pending_event(vcpu, kvm_run); /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) @@ -3292,14 +3630,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); - get_debugreg(vcpu->arch.host_dr6, 6); - get_debugreg(vcpu->arch.host_dr7, 7); if (unlikely(vcpu->arch.switch_db_regs)) { - get_debugreg(vcpu->arch.host_db[0], 0); - get_debugreg(vcpu->arch.host_db[1], 1); - get_debugreg(vcpu->arch.host_db[2], 2); - get_debugreg(vcpu->arch.host_db[3], 3); - set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); @@ -3307,18 +3638,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.eff_db[3], 3); } - KVMTRACE_0D(VMENTRY, vcpu, entryexit); + trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.host_db[0], 0); - set_debugreg(vcpu->arch.host_db[1], 1); - set_debugreg(vcpu->arch.host_db[2], 2); - set_debugreg(vcpu->arch.host_db[3], 3); + if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); } - set_debugreg(vcpu->arch.host_dr6, 6); - set_debugreg(vcpu->arch.host_dr7, 7); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); @@ -3648,11 +3978,8 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu, static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, struct kvm_segment *kvm_desct) { - kvm_desct->base = seg_desc->base0; - kvm_desct->base |= seg_desc->base1 << 16; - kvm_desct->base |= seg_desc->base2 << 24; - kvm_desct->limit = seg_desc->limit0; - kvm_desct->limit |= seg_desc->limit << 16; + kvm_desct->base = get_desc_base(seg_desc); + kvm_desct->limit = get_desc_limit(seg_desc); if (seg_desc->g) { kvm_desct->limit <<= 12; kvm_desct->limit |= 0xfff; @@ -3696,7 +4023,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3706,16 +4032,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); + return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3723,19 +4046,13 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, struct desc_struct *seg_desc) { - u32 base_addr; - - base_addr = seg_desc->base0; - base_addr |= (seg_desc->base1 << 16); - base_addr |= (seg_desc->base2 << 24); + u32 base_addr = get_desc_base(seg_desc); return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } @@ -3780,12 +4097,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se return 0; } +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) && + (seg != VCPU_SREG_TR) && + (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; - if (!(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; @@ -4024,7 +4348,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { + if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } @@ -4094,13 +4418,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; - - down_read(&vcpu->kvm->slots_lock); - if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT)) - vcpu->arch.cr3 = sregs->cr3; - else - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - up_read(&vcpu->kvm->slots_lock); + vcpu->arch.cr3 = sregs->cr3; kvm_set_cr8(vcpu, sregs->cr8); @@ -4142,8 +4460,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + update_cr8_intercept(vcpu); + /* Older userspace won't unhalt the vcpu on reset. */ - if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !(vcpu->arch.cr0 & X86_CR0_PE)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -4414,7 +4734,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -4436,6 +4756,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_mmu_destroy; } + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL); + if (!vcpu->arch.mce_banks) { + r = -ENOMEM; + goto fail_mmu_destroy; + } + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return 0; fail_mmu_destroy: @@ -4483,20 +4811,22 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; + struct kvm_vcpu *vcpu; /* * Unpin any mmu pages first. */ - for (i = 0; i < KVM_MAX_VCPUS; ++i) - if (kvm->vcpus[i]) - kvm_unload_vcpu_mmu(kvm->vcpus[i]); - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_unload_vcpu_mmu(vcpu); + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + atomic_set(&kvm->online_vcpus, 0); + mutex_unlock(&kvm->lock); } void kvm_arch_sync_events(struct kvm *kvm) @@ -4573,7 +4903,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm, kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); return 0; } @@ -4587,8 +4916,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending || + (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)); } void kvm_vcpu_kick(struct kvm_vcpu *vcpu) @@ -4612,3 +4943,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { return kvm_x86_ops->interrupt_allowed(vcpu); } + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4c8e10af78e..5eadea585d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr) { return (nr == BP_VECTOR) || (nr == OF_VECTOR); } + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); + #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 07c31899c9c..9e609206fac 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -9,6 +9,8 @@ lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +obj-y += msr-reg.o msr-reg-export.o + ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o lib-y += checksum_32.o diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c new file mode 100644 index 00000000000..a311cc59b65 --- /dev/null +++ b/arch/x86/lib/msr-reg-export.c @@ -0,0 +1,5 @@ +#include <linux/module.h> +#include <asm/msr.h> + +EXPORT_SYMBOL(native_rdmsr_safe_regs); +EXPORT_SYMBOL(native_wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S new file mode 100644 index 00000000000..69fa10623f2 --- /dev/null +++ b/arch/x86/lib/msr-reg.S @@ -0,0 +1,102 @@ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <asm/dwarf2.h> +#include <asm/asm.h> +#include <asm/msr.h> + +#ifdef CONFIG_X86_64 +/* + * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * + * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] + * + */ +.macro op_safe_regs op +ENTRY(native_\op\()_safe_regs) + CFI_STARTPROC + pushq_cfi %rbx + pushq_cfi %rbp + movq %rdi, %r10 /* Save pointer */ + xorl %r11d, %r11d /* Return value */ + movl (%rdi), %eax + movl 4(%rdi), %ecx + movl 8(%rdi), %edx + movl 12(%rdi), %ebx + movl 20(%rdi), %ebp + movl 24(%rdi), %esi + movl 28(%rdi), %edi + CFI_REMEMBER_STATE +1: \op +2: movl %eax, (%r10) + movl %r11d, %eax /* Return value */ + movl %ecx, 4(%r10) + movl %edx, 8(%r10) + movl %ebx, 12(%r10) + movl %ebp, 20(%r10) + movl %esi, 24(%r10) + movl %edi, 28(%r10) + popq_cfi %rbp + popq_cfi %rbx + ret +3: + CFI_RESTORE_STATE + movl $-EIO, %r11d + jmp 2b + + _ASM_EXTABLE(1b, 3b) + CFI_ENDPROC +ENDPROC(native_\op\()_safe_regs) +.endm + +#else /* X86_32 */ + +.macro op_safe_regs op +ENTRY(native_\op\()_safe_regs) + CFI_STARTPROC + pushl_cfi %ebx + pushl_cfi %ebp + pushl_cfi %esi + pushl_cfi %edi + pushl_cfi $0 /* Return value */ + pushl_cfi %eax + movl 4(%eax), %ecx + movl 8(%eax), %edx + movl 12(%eax), %ebx + movl 20(%eax), %ebp + movl 24(%eax), %esi + movl 28(%eax), %edi + movl (%eax), %eax + CFI_REMEMBER_STATE +1: \op +2: pushl_cfi %eax + movl 4(%esp), %eax + popl_cfi (%eax) + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + movl %ecx, 4(%eax) + movl %edx, 8(%eax) + movl %ebx, 12(%eax) + movl %ebp, 20(%eax) + movl %esi, 24(%eax) + movl %edi, 28(%eax) + popl_cfi %eax + popl_cfi %edi + popl_cfi %esi + popl_cfi %ebp + popl_cfi %ebx + ret +3: + CFI_RESTORE_STATE + movl $-EIO, 4(%esp) + jmp 2b + + _ASM_EXTABLE(1b, 3b) + CFI_ENDPROC +ENDPROC(native_\op\()_safe_regs) +.endm + +#endif + +op_safe_regs rdmsr +op_safe_regs wrmsr + diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index caa24aca811..33a1e3ca22d 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -175,3 +175,52 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_on_cpu); + +/* + * These variants are significantly slower, but allows control over + * the entire 32-bit GPR set. + */ +struct msr_regs_info { + u32 *regs; + int err; +}; + +static void __rdmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = rdmsr_safe_regs(rv->regs); +} + +static void __wrmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = wrmsr_safe_regs(rv->regs); +} + +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); + +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index eefdeee8a87..9b5a9f59a47 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,9 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o gup.o + pat.o pgtable.o physaddr.o gup.o + +# Make sure __phys_addr has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_physaddr.o := $(nostackp) obj-$(CONFIG_SMP) += tlb.o diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bfae139182f..775a020990a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -285,26 +285,25 @@ check_v8086_mode(struct pt_regs *regs, unsigned long address, tsk->thread.screen_bitmap |= 1 << bit; } -static void dump_pagetable(unsigned long address) +static bool low_pfn(unsigned long pfn) { - __typeof__(pte_val(__pte(0))) page; + return pfn < max_low_pfn; +} - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; #ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); + printk("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; #endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* * We must not directly access the pte in the highpte @@ -312,16 +311,12 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + pte = pte_offset_kernel(pmd, address); + printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: printk("\n"); } @@ -450,16 +445,12 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *pgd; + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - - pgd += pgd_index(address); if (bad_address(pgd)) goto bad; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2112ed55e7e..63a6ba66cbe 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -24,7 +24,7 @@ void kunmap(struct page *page) * no global lock is needed and because the kmap code must perform a global TLB * invalidation when the kmap pool wraps. * - * However when holding an atomic kmap is is not legal to sleep, so atomic + * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) @@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) { diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2a76eba9da2..334e63ca7b2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -22,77 +22,7 @@ #include <asm/pgalloc.h> #include <asm/pat.h> -static inline int phys_addr_valid(resource_size_t addr) -{ -#ifdef CONFIG_PHYS_ADDR_T_64BIT - return !(addr >> boot_cpu_data.x86_phys_bits); -#else - return 1; -#endif -} - -#ifdef CONFIG_X86_64 - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; - } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); - } - return x; -} -EXPORT_SYMBOL(__phys_addr); - -bool __virt_addr_valid(unsigned long x) -{ - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) - return false; - x += phys_base; - } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) - return false; - } - - return pfn_valid(x >> PAGE_SHIFT); -} -EXPORT_SYMBOL(__virt_addr_valid); - -#else - -#ifdef CONFIG_DEBUG_VIRTUAL -unsigned long __phys_addr(unsigned long x) -{ - /* VMALLOC_* aren't constants */ - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); -#endif - -bool __virt_addr_valid(unsigned long x) -{ - if (x < PAGE_OFFSET) - return false; - if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) - return false; - if (x >= FIXADDR_START) - return false; - return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); -} -EXPORT_SYMBOL(__virt_addr_valid); - -#endif +#include "physaddr.h" int page_is_ram(unsigned long pagenr) { diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index 2c55ed09865..528bf954eb7 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -331,6 +331,20 @@ static void kmemcheck_read_strict(struct pt_regs *regs, kmemcheck_shadow_set(shadow, size); } +bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) +{ + enum kmemcheck_shadow status; + void *shadow; + + shadow = kmemcheck_shadow_lookup(addr); + if (!shadow) + return true; + + status = kmemcheck_shadow_test(shadow, size); + + return status == KMEMCHECK_SHADOW_INITIALIZED; +} + /* Access may cross page boundary */ static void kmemcheck_read(struct pt_regs *regs, unsigned long addr, unsigned int size) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e245775ec85..24952fdc7e4 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pfn.h> +#include <linux/percpu.h> #include <asm/e820.h> #include <asm/processor.h> @@ -686,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -744,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 9b647f67938..7257cf3decf 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -994,7 +994,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations memtype_seq_ops = { +static const struct seq_operations memtype_seq_ops = { .start = memtype_seq_start, .next = memtype_seq_next, .stop = memtype_seq_stop, diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c new file mode 100644 index 00000000000..d2e2735327b --- /dev/null +++ b/arch/x86/mm/physaddr.c @@ -0,0 +1,70 @@ +#include <linux/mmdebug.h> +#include <linux/module.h> +#include <linux/mm.h> + +#include <asm/page.h> + +#include "physaddr.h" + +#ifdef CONFIG_X86_64 + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(!phys_addr_valid(x)); + } + return x; +} +EXPORT_SYMBOL(__phys_addr); + +bool __virt_addr_valid(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (!phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#else + +#ifdef CONFIG_DEBUG_VIRTUAL +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); +#endif + +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) + return false; + if (x >= FIXADDR_START) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h new file mode 100644 index 00000000000..a3cd5a0c97b --- /dev/null +++ b/arch/x86/mm/physaddr.h @@ -0,0 +1,10 @@ +#include <asm/processor.h> + +static inline int phys_addr_valid(resource_size_t addr) +{ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + return !(addr >> boot_cpu_data.x86_phys_bits); +#else + return 1; +#endif +} diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 29a0e37114f..6f8aa33031c 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -215,7 +215,7 @@ int __init get_memcfg_from_srat(void) goto out_fail; if (num_memory_chunks == 0) { - printk(KERN_WARNING + printk(KERN_DEBUG "could not find any ACPI SRAT memory areas.\n"); goto out_fail; } @@ -277,7 +277,7 @@ int __init get_memcfg_from_srat(void) } return 1; out_fail: - printk(KERN_ERR "failed to get NUMA memory information from SRAT" + printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" " table\n"); return 0; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 89b9a5cd63d..cb88b1a0bd5 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,11 +1,14 @@ /** * @file nmi_int.c * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Robert Richter <robert.richter@amd.com> + * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Jason Yeh <jason.yeh@amd.com> + * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> */ #include <linux/init.h> @@ -24,13 +27,35 @@ #include "op_counter.h" #include "op_x86_model.h" -static struct op_x86_model_spec const *model; +static struct op_x86_model_spec *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; +struct op_counter_config counter_config[OP_MAX_COUNTER]; + +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + event &= model->event_mask ? model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (event & 0x0F00) << 24; + + return val; +} + + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -52,36 +77,214 @@ static int profile_exceptions_notify(struct notifier_block *self, static void nmi_cpu_save_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - rdmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); + } + + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); + } +} + +static void nmi_cpu_start(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->start(msrs); +} + +static int nmi_start(void) +{ + on_each_cpu(nmi_cpu_start, NULL, 1); + return 0; +} + +static void nmi_cpu_stop(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->stop(msrs); +} + +static void nmi_stop(void) +{ + on_each_cpu(nmi_cpu_stop, NULL, 1); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static DEFINE_PER_CPU(int, switch_index); + +static inline int has_mux(void) +{ + return !!model->switch_ctrl; +} + +inline int op_x86_phys_to_virt(int phys) +{ + return __get_cpu_var(switch_index) + phys; +} + +inline int op_x86_virt_to_phys(int virt) +{ + return virt % model->num_counters; +} + +static void nmi_shutdown_mux(void) +{ + int i; + + if (!has_mux()) + return; + + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; + per_cpu(switch_index, i) = 0; } +} + +static int nmi_setup_mux(void) +{ + size_t multiplex_size = + sizeof(struct op_msr) * model->num_virt_counters; + int i; + + if (!has_mux()) + return 1; + + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).multiplex = + kmalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) + return 0; + } + + return 1; +} + +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + if (!has_mux()) + return; - for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - rdmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].addr = 0; + multiplex[i].saved = 0; } } + + per_cpu(switch_index, cpu) = 0; +} + +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + } } -static void nmi_save_registers(void *dummy) +static void nmi_cpu_switch(void *dummy) { int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - nmi_cpu_save_registers(msrs); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!has_mux()) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + return 0; +} + +static inline void mux_init(struct oprofile_operations *ops) +{ + if (has_mux()) + ops->switch_events = nmi_switch_event; +} + +static void mux_clone(int cpu) +{ + if (!has_mux()) + return; + + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); } +#else + +inline int op_x86_phys_to_virt(int phys) { return phys; } +inline int op_x86_virt_to_phys(int virt) { return virt; } +static inline void nmi_shutdown_mux(void) { } +static inline int nmi_setup_mux(void) { return 1; } +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } +static inline void mux_init(struct oprofile_operations *ops) { } +static void mux_clone(int cpu) { } + +#endif + static void free_msrs(void) { int i; @@ -95,38 +298,32 @@ static void free_msrs(void) static int allocate_msrs(void) { - int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).counters) { - success = 0; - break; - } + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).counters) + return 0; per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).controls) { - success = 0; - break; - } + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).controls) + return 0; } - if (!success) - free_msrs(); - - return success; + return 1; } static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); - model->setup_ctrs(msrs); + model->setup_ctrs(model, msrs); + nmi_cpu_setup_mux(cpu, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -144,11 +341,15 @@ static int nmi_setup(void) int cpu; if (!allocate_msrs()) - return -ENOMEM; + err = -ENOMEM; + else if (!nmi_setup_mux()) + err = -ENOMEM; + else + err = register_die_notifier(&profile_exceptions_nb); - err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + nmi_shutdown_mux(); return err; } @@ -159,45 +360,38 @@ static int nmi_setup(void) /* Assume saved/restored counters are the same on all CPUs */ model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); for_each_possible_cpu(cpu) { - if (cpu != 0) { - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); - } + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + mux_clone(cpu); } - on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - wrmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + for (i = 0; i < model->num_controls; ++i) { + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); } - for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - wrmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + for (i = 0; i < model->num_counters; ++i) { + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); } } @@ -205,7 +399,7 @@ static void nmi_cpu_shutdown(void *dummy) { unsigned int v; int cpu = smp_processor_id(); - struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal. That's by design: on @@ -216,7 +410,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); } static void nmi_shutdown(void) @@ -226,42 +420,18 @@ static void nmi_shutdown(void) nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + nmi_shutdown_mux(); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); put_cpu_var(cpu_msrs); } -static void nmi_cpu_start(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->start(msrs); -} - -static int nmi_start(void) -{ - on_each_cpu(nmi_cpu_start, NULL, 1); - return 0; -} - -static void nmi_cpu_stop(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->stop(msrs); -} - -static void nmi_stop(void) -{ - on_each_cpu(nmi_cpu_stop, NULL, 1); -} - -struct op_counter_config counter_config[OP_MAX_COUNTER]; - static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; - for (i = 0; i < model->num_counters; ++i) { + for (i = 0; i < model->num_virt_counters; ++i) { struct dentry *dir; char buf[4]; @@ -270,7 +440,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) * NOTE: assumes 1:1 mapping here (that counters are organized * sequentially in their struct assignment). */ - if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) + if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) continue; snprintf(buf, sizeof(buf), "%d", i); @@ -402,6 +572,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; @@ -428,7 +599,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - arch_perfmon_setup_counters(); + spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -439,17 +610,7 @@ static int __init ppro_init(char **cpu_type) return 0; } - model = &op_ppro_spec; - return 1; -} - -static int __init arch_perfmon_init(char **cpu_type) -{ - if (!cpu_has_arch_perfmon) - return 0; - *cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - arch_perfmon_setup_counters(); + model = spec; return 1; } @@ -471,27 +632,26 @@ int __init op_nmi_init(struct oprofile_operations *ops) /* Needs to be at least an Athlon (or hammer in 32bit mode) */ switch (family) { - default: - return -ENODEV; case 6: - model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_amd_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. */ + /* + * Actually it could be i386/hammer too, but + * give user space an consistent name. + */ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; + default: + return -ENODEV; } + model = &op_amd_spec; break; case X86_VENDOR_INTEL: @@ -510,8 +670,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) break; } - if (!cpu_type && !arch_perfmon_init(&cpu_type)) + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; break; default: @@ -522,18 +689,23 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; if (model->init) ret = model->init(ops); if (ret) return ret; + if (!model->num_virt_counters) + model->num_virt_counters = model->num_counters; + + mux_init(ops); + init_sysfs(); using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 91b6a116165..e28398df0df 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,7 +10,7 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8fdf06e4edf..39686c29f03 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -9,12 +9,15 @@ * @author Philippe Elie * @author Graydon Hoare * @author Robert Richter <robert.richter@amd.com> - * @author Barry Kasindorf + * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Jason Yeh <jason.yeh@amd.com> + * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> */ #include <linux/oprofile.h> #include <linux/device.h> #include <linux/pci.h> +#include <linux/percpu.h> #include <asm/ptrace.h> #include <asm/msr.h> @@ -25,43 +28,36 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_CONTROLS 32 +#else +#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_CONTROLS NUM_CONTROLS +#endif + +#define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) - -static unsigned long reset_value[NUM_COUNTERS]; +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) + +static unsigned long reset_value[NUM_VIRT_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS /* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL /*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 static int has_ibs; /* AMD Family10h and later */ @@ -78,6 +74,45 @@ static struct op_ibs_config ibs_config; #endif +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_VIRT_COUNTERS; i++) { + int hw_counter = op_x86_virt_to_phys(i); + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->multiplex[i].addr = 0; + } +} + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#else + +static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } + +#endif + /* functions for op_amd_spec */ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) @@ -97,150 +132,174 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) else msrs->controls[i].addr = 0; } -} + op_mux_fill_in_addresses(msrs); +} -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; + /* setup reset_value */ + for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + if (counter_config[i].enabled) + reset_value[i] = counter_config[i].count; + else + reset_value[i] = 0; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + for (i = 0; i < NUM_CONTROLS; ++i) { + if (unlikely(!msrs->controls[i].addr)) continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; - CTR_WRITE(1, msrs, i); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + if (!msrs->counters[i].addr) + continue; - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; - } + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); } } #ifdef CONFIG_OPROFILE_IBS -static inline int +static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - u32 low, high; - u64 msr; + u64 val, ctl; struct op_entry entry; if (!has_ibs) - return 1; + return; if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - oprofile_add_data(&entry, low); - oprofile_add_data(&entry, high); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } } if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSOPRIP, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA2, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA3, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCLINAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data64(&entry, val); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; + ctl |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } +} - return 1; +static inline void op_amd_start_ibs(void) +{ + u64 val; + if (has_ibs && ibs_config.fetch_enabled) { + val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); + } + + if (has_ibs && ibs_config.op_enabled) { + val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, val); + } +} + +static void op_amd_stop_ibs(void) +{ + if (has_ibs && ibs_config.fetch_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); + + if (has_ibs && ibs_config.op_enabled) + /* clear max count and enable */ + wrmsrl(MSR_AMD64_IBSOPCTL, 0); } +#else + +static inline void op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { } +static inline void op_amd_start_ibs(void) { } +static inline void op_amd_stop_ibs(void) { } + #endif static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, virt); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); } -#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); -#endif /* See op_model_ppro.c */ return 1; @@ -248,79 +307,50 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, static void op_amd_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } - if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_start_ibs(); } - static void op_amd_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!reset_value[op_x86_phys_to_virt(i)]) continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } - if (has_ibs && ibs_config.op_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_stop_ibs(); } static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + for (i = 0; i < NUM_COUNTERS; ++i) { + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + for (i = 0; i < NUM_CONTROLS; ++i) { + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } } @@ -490,15 +520,21 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ -struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, +struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_virt_counters = NUM_VIRT_COUNTERS, + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .shutdown = &op_amd_shutdown, +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + .switch_ctrl = &op_mux_switch_ctrl, +#endif }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 819b131fd75..ac6b354becd 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -32,6 +32,8 @@ #define NUM_CCCRS_HT2 9 #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) +#define OP_CTR_OVERFLOW (1ULL<<31) + static unsigned int num_counters = NUM_COUNTERS_NON_HT; static unsigned int num_controls = NUM_CONTROLS_NON_HT; @@ -350,8 +352,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) -#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -361,17 +361,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) -#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) -#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) - /* this assigns a "stagger" to the current CPU, which is used throughout the code in this module as an extra array offset, to select the "even" @@ -515,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) if (ev->bindings[i].virt_counter & counter_bit) { /* modify ESCR */ - ESCR_READ(escr, high, ev, i); + rdmsr(ev->bindings[i].escr_address, escr, high); ESCR_CLEAR(escr); if (stag == 0) { ESCR_SET_USR_0(escr, counter_config[ctr].user); @@ -526,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } ESCR_SET_EVENT_SELECT(escr, ev->event_select); ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - ESCR_WRITE(escr, high, ev, i); + wrmsr(ev->bindings[i].escr_address, escr, high); /* modify CCCR */ - CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); @@ -537,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) CCCR_SET_PMI_OVF_0(cccr); else CCCR_SET_PMI_OVF_1(cccr); - CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); return; } } @@ -548,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } -static void p4_setup_ctrs(struct op_msrs const * const msrs) +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int i; unsigned int low, high; @@ -563,8 +558,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) } /* clear the cccrs we will use */ - for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + for (i = 0; i < num_counters; i++) { + if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -574,17 +569,18 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ - for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { + for (i = 0; i < num_counters; ++i) { + if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(u64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -624,14 +620,16 @@ static int p4_check_ctrs(struct pt_regs * const regs, real = VIRT_CTR(stag, i); - CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); - if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); + if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], real); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); CCCR_CLEAR_OVF(low); - CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsrl(p4_counters[real].counter_address, + -(u64)reset_value[i]); } } @@ -653,9 +651,9 @@ static void p4_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_ENABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -670,9 +668,9 @@ static void p4_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_DISABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -680,8 +678,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } /* @@ -689,15 +687,15 @@ static void p4_shutdown(struct op_msrs const * const msrs) * conjunction with the counter registers (hence the starting offset). * This saves a few bits. */ - for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + for (i = num_counters; i < num_controls; ++i) { + if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } } #ifdef CONFIG_SMP -struct op_x86_model_spec const op_p4_ht2_spec = { +struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, @@ -709,7 +707,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = { }; #endif -struct op_x86_model_spec const op_p4_spec = { +struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4da7230b3d1..4899215999d 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -10,6 +10,7 @@ * @author Philippe Elie * @author Graydon Hoare * @author Andi Kleen + * @author Robert Richter <robert.richter@amd.com> */ #include <linux/oprofile.h> @@ -18,7 +19,6 @@ #include <asm/msr.h> #include <asm/apic.h> #include <asm/nmi.h> -#include <asm/perf_counter.h> #include "op_x86_model.h" #include "op_counter.h" @@ -26,20 +26,7 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR(x) (x &= (1<<21)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT(val, e) (val |= e) +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -63,9 +50,10 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) } -static void ppro_setup_ctrs(struct op_msrs const * const msrs) +static void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) { @@ -93,36 +81,30 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) } /* clear all counters */ - for (i = 0 ; i < num_counters; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + for (i = 0; i < num_counters; ++i) { + if (unlikely(!msrs->controls[i].addr)) continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < num_counters; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -143,14 +125,14 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (unlikely(!reset_value)) goto out; - for (i = 0 ; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); - if (CTR_OVERFLOWED(val)) { - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } out: @@ -171,16 +153,16 @@ out: static void ppro_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } } @@ -188,7 +170,7 @@ static void ppro_start(struct op_msrs const * const msrs) static void ppro_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) @@ -196,9 +178,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } @@ -206,12 +188,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < num_counters ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + for (i = 0; i < num_counters; ++i) { + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } if (reset_value) { @@ -222,8 +204,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, /* can be overriden */ - .num_controls = 2, /* dito */ + .num_counters = 2, + .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -241,7 +224,7 @@ struct op_x86_model_spec op_ppro_spec = { * the specific CPU. */ -void arch_perfmon_setup_counters(void) +static void arch_perfmon_setup_counters(void) { union cpuid10_eax eax; @@ -259,11 +242,17 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; - op_ppro_spec.num_counters = num_counters; - op_ppro_spec.num_controls = num_counters; +} + +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; } struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, + .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, /* user space does the cpuid check for available events */ diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 825e79064d6..b83776180c7 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -6,51 +6,66 @@ * @remark Read the file COPYING * * @author Graydon Hoare + * @author Robert Richter <robert.richter@amd.com> */ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -struct op_saved_msr { - unsigned int high; - unsigned int low; -}; +#include <asm/types.h> +#include <asm/perf_counter.h> struct op_msr { - unsigned long addr; - struct op_saved_msr saved; + unsigned long addr; + u64 saved; }; struct op_msrs { struct op_msr *counters; struct op_msr *controls; + struct op_msr *multiplex; }; struct pt_regs; +struct oprofile_operations; + /* The model vtable abstracts the differences between * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { - int (*init)(struct oprofile_operations *ops); - void (*exit)(void); - unsigned int num_counters; - unsigned int num_controls; - void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); + unsigned int num_counters; + unsigned int num_controls; + unsigned int num_virt_counters; + u64 reserved; + u16 event_mask; + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + void (*switch_ctrl)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); +#endif }; +struct op_counter_config; + +extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config); +extern int op_x86_phys_to_virt(int phys); +extern int op_x86_virt_to_phys(int virt); + extern struct op_x86_model_spec op_ppro_spec; -extern struct op_x86_model_spec const op_p4_spec; -extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_p4_spec; +extern struct op_x86_model_spec op_p4_ht2_spec; +extern struct op_x86_model_spec op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; -extern void arch_perfmon_setup_counters(void); - #endif /* OP_X86_MODEL_H */ diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 3ffa10df20b..572ee9782f2 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -15,63 +15,6 @@ * also get peer root bus resource for io,mmio */ -#ifdef CONFIG_NUMA - -#define BUS_NR 256 - -#ifdef CONFIG_X86_64 - -static int mp_bus_to_node[BUS_NR]; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node = -1; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return node; - - node = mp_bus_to_node[busnum]; - - /* - * let numa_node_id to decide it later in dma_alloc_pages - * if there is no ram on that node - */ - if (node != -1 && !node_online(node)) - node = -1; - - return node; -} - -#else /* CONFIG_X86_32 */ - -static unsigned char mp_bus_to_node[BUS_NR]; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = (unsigned char) node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return 0; - node = mp_bus_to_node[busnum]; - return node; -} - -#endif /* CONFIG_X86_32 */ - -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_X86_64 /* @@ -301,11 +244,6 @@ static int __init early_fill_mp_bus_info(void) u64 val; u32 address; -#ifdef CONFIG_NUMA - for (i = 0; i < BUS_NR; i++) - mp_bus_to_node[i] = -1; -#endif - if (!early_pci_allowed()) return -1; @@ -346,7 +284,7 @@ static int __init early_fill_mp_bus_info(void) node = (reg >> 4) & 0x07; #ifdef CONFIG_NUMA for (j = min_bus; j <= max_bus; j++) - mp_bus_to_node[j] = (unsigned char) node; + set_mp_bus_to_node(j, node); #endif link = (reg >> 8) & 0x03; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2202b6257b8..5db96d4304d 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -600,3 +600,72 @@ struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) { return pci_scan_bus_on_node(busno, &pci_root_ops, -1); } + +/* + * NUMA info for PCI busses + * + * Early arch code is responsible for filling in reasonable values here. + * A node id of "-1" means "use current node". In other words, if a bus + * has a -1 node id, it's not tightly coupled to any particular chunk + * of memory (as is the case on some Nehalem systems). + */ +#ifdef CONFIG_NUMA + +#define BUS_NR 256 + +#ifdef CONFIG_X86_64 + +static int mp_bus_to_node[BUS_NR] = { + [0 ... BUS_NR - 1] = -1 +}; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node = -1; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return node; + + node = mp_bus_to_node[busnum]; + + /* + * let numa_node_id to decide it later in dma_alloc_pages + * if there is no ram on that node + */ + if (node != -1 && !node_online(node)) + node = -1; + + return node; +} + +#else /* CONFIG_X86_32 */ + +static unsigned char mp_bus_to_node[BUS_NR] = { + [0 ... BUS_NR - 1] = -1 +}; + +void set_mp_bus_to_node(int busnum, int node) +{ + if (busnum >= 0 && busnum < BUS_NR) + mp_bus_to_node[busnum] = (unsigned char) node; +} + +int get_mp_bus_to_node(int busnum) +{ + int node; + + if (busnum < 0 || busnum > (BUS_NR - 1)) + return 0; + node = mp_bus_to_node[busnum]; + return node; +} + +#endif /* CONFIG_X86_32 */ + +#endif /* CONFIG_NUMA */ diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index bd13c3e4c6d..347d882b3bb 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -192,13 +192,14 @@ struct pci_raw_ops pci_direct_conf2 = { static int __init pci_sanity_check(struct pci_raw_ops *o) { u32 x = 0; - int devfn; + int year, devfn; if (pci_probe & PCI_NO_CHECKS) return 1; /* Assume Type 1 works for newer systems. This handles machines that don't have anything on PCI Bus 0. */ - if (dmi_get_year(DMI_BIOS_DATE) >= 2001) + dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL); + if (year >= 2001) return 1; for (devfn = 0; devfn < 0x100; devfn++) { diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 7410640db17..3bb4fc21f4f 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -8,6 +8,7 @@ endif # Make sure early boot has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_enlighten.o := $(nostackp) +CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ @@ -16,3 +17,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o + diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index eb33aaa8415..0dd0c2c6cae 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -51,6 +51,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> +#include <asm/stackprotector.h> #include "xen-ops.h" #include "mmu.h" @@ -330,18 +331,28 @@ static void xen_load_gdt(const struct desc_ptr *dtr) unsigned long frames[pages]; int f; - /* A GDT can be up to 64k in size, which corresponds to 8192 - 8-byte entries, or 16 4k pages.. */ + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ BUG_ON(size > 65536); BUG_ON(va & ~PAGE_MASK); for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { int level; - pte_t *ptep = lookup_address(va, &level); + pte_t *ptep; unsigned long pfn, mfn; void *virt; + /* + * The GDT is per-cpu and is in the percpu data area. + * That can be virtually mapped, so we need to do a + * page-walk to get the underlying MFN for the + * hypercall. The page can also be in the kernel's + * linear range, so we need to RO that mapping too. + */ + ptep = lookup_address(va, &level); BUG_ON(ptep == NULL); pfn = pte_pfn(*ptep); @@ -358,6 +369,44 @@ static void xen_load_gdt(const struct desc_ptr *dtr) BUG(); } +/* + * load_gdt for early boot, when the gdt is only mapped once + */ +static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) +{ + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long frames[pages]; + int f; + + /* + * A GDT can be up to 64k in size, which corresponds to 8192 + * 8-byte entries, or 16 4k pages.. + */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + pte_t pte; + unsigned long pfn, mfn; + + pfn = virt_to_pfn(va); + mfn = pfn_to_mfn(pfn); + + pte = pfn_pte(pfn, PAGE_KERNEL_RO); + + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) + BUG(); + + frames[f] = mfn; + } + + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); +} + static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { @@ -581,6 +630,29 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, preempt_enable(); } +/* + * Version of write_gdt_entry for use at early boot-time needed to + * update an entry as simply as possible. + */ +static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + switch (type) { + case DESC_LDT: + case DESC_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) + dt[entry] = *(struct desc_struct *)desc; + } + + } +} + static void xen_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -714,7 +786,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) set: base = ((u64)high << 32) | low; if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EFAULT; + ret = -EIO; break; #endif @@ -965,6 +1037,23 @@ static const struct machine_ops __initdata xen_machine_ops = { .emergency_restart = xen_emergency_restart, }; +/* + * Set up the GDT and segment registers for -fstack-protector. Until + * we do this, we have to be careful not to call any stack-protected + * function, which is most of the kernel. + */ +static void __init xen_setup_stackprotector(void) +{ + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; + pv_cpu_ops.load_gdt = xen_load_gdt_boot; + + setup_stack_canary_segment(0); + switch_to_new_gdt(0); + + pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; + pv_cpu_ops.load_gdt = xen_load_gdt; +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -983,13 +1072,28 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; -#ifdef CONFIG_X86_64 /* - * Setup percpu state. We only need to do this for 64-bit - * because 32-bit already has %fs set properly. + * Set up some pagetable state before starting to set any ptes. */ - load_percpu_segment(0); -#endif + + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + __supported_pte_mask |= _PAGE_IOMAP; + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_build_dynamic_phys_to_machine(); + + /* + * Set up kernel GDT and segment registers, mainly so that + * -fstack-protector code can be executed. + */ + xen_setup_stackprotector(); xen_init_irq_ops(); xen_init_cpuid_mask(); @@ -1001,8 +1105,6 @@ asmlinkage void __init xen_start_kernel(void) set_xen_basic_apic_ops(); #endif - xen_setup_features(); - if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; @@ -1019,17 +1121,8 @@ asmlinkage void __init xen_start_kernel(void) xen_smp_init(); - /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_build_dynamic_phys_to_machine(); - pgd = (pgd_t *)xen_start_info->pt_base; - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!xen_initial_domain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - #ifdef CONFIG_X86_64 /* Work out if we support NX */ check_efer(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 429834ec168..fe03eeed7b4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -236,6 +236,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #else ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 5601506f2dd..36a5141108d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -187,7 +187,6 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; - unsigned long flags; u64 start; /* If kicker interrupts not initialized yet, just spin */ @@ -199,16 +198,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl /* announce we're spinning */ prev = spinning_lock(xl); - flags = __raw_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } - ADD_STATS(taken_slow, 1); ADD_STATS(taken_slow_nested, prev != NULL); do { + unsigned long flags; + /* clear pending */ xen_clear_irq_pending(irq); @@ -228,6 +223,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl goto out; } + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + /* * Block until irq becomes pending. If we're * interrupted at this point (after the trylock but @@ -238,13 +239,15 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl * pending. */ xen_poll_irq(irq); + + raw_local_irq_restore(flags); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: - raw_local_irq_restore(flags); unspinning_lock(xl, prev); spin_time_accum_blocked(start); @@ -323,8 +326,13 @@ static void xen_spin_unlock(struct raw_spinlock *lock) smp_wmb(); /* make sure no writes get moved after unlock */ xl->lock = 0; /* release lock */ - /* make sure unlock happens before kick */ - barrier(); + /* + * Make sure unlock happens before checking for waiting + * spinners. We need a strong barrier to enforce the + * write-read ordering to different memory locations, as the + * CPU makes no implied guarantees about their ordering. + */ + mb(); if (unlikely(xl->spinners)) xen_spin_unlock_slow(xl); |