diff options
Diffstat (limited to 'arch/x86')
45 files changed, 713 insertions, 689 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7109037bdf7..59eef1c7fda 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,8 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_OPROFILE + select HAVE_KPROBES config GENERIC_LOCKBREAK def_bool n @@ -106,10 +108,6 @@ config GENERIC_TIME_VSYSCALL config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 -config ARCH_SUPPORTS_OPROFILE - bool - default y - select HAVE_KVM config ARCH_HIBERNATION_POSSIBLE @@ -204,8 +202,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also the <file:Documentation/smp.txt>, - <file:Documentation/i386/IO-APIC.txt>, + See also <file:Documentation/i386/IO-APIC.txt>, <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at <http://www.tldp.org/docs.html#howto>. @@ -309,6 +306,7 @@ config X86_RDC321X select M486 select X86_REBOOTFIXUPS select GENERIC_GPIO + select LEDS_CLASS select LEDS_GPIO help This option is needed for RDC R-321x system-on-chip, also known @@ -1597,8 +1595,6 @@ source "drivers/firmware/Kconfig" source "fs/Kconfig" -source "kernel/Kconfig.instrumentation" - source "arch/x86/Kconfig.debug" source "security/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8978e98bed5..364865b1b08 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -92,7 +92,6 @@ KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) LDFLAGS := -m elf_$(UTS_MACHINE) -OBJCOPYFLAGS := -O binary -R .note -R .comment -S # Speed up the build KBUILD_CFLAGS += -pipe diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 349b81a39c4..f88458e83ef 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,7 +26,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA #RAMDISK := -DRAMDISK=512 targets := vmlinux.bin setup.bin setup.elf zImage bzImage -subdir- := compressed +subdir- := compressed setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o @@ -43,9 +43,17 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs-y := tools/build +hostprogs-y := mkcpustr tools/build -HOSTCFLAGS_build.o := $(LINUXINCLUDE) +HOST_EXTRACFLAGS += $(LINUXINCLUDE) + +$(obj)/cpu.o: $(obj)/cpustr.h + +quiet_cmd_cpustr = CPUSTR $@ + cmd_cpustr = $(obj)/mkcpustr > $@ +targets += cpustr.h +$(obj)/cpustr.h: $(obj)/mkcpustr FORCE + $(call if_changed,cpustr) # --------------------------------------------------------------------------- @@ -80,6 +88,7 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ $(call if_changed,image) @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE $(call if_changed,objcopy) @@ -90,7 +99,6 @@ $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_setup.bin := -O binary - $(obj)/setup.bin: $(obj)/setup.elf FORCE $(call if_changed,objcopy) @@ -98,7 +106,7 @@ $(obj)/compressed/vmlinux: FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel -FDARGS = +FDARGS = # Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel FDINITRD = diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fe24ceabd90..d2b9f3bb87c 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -22,6 +22,7 @@ $(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $ $(call if_changed,ld) @: +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 1ccb38a7f0d..e8657b98c90 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -80,8 +80,8 @@ startup_32: #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(LARGE_PAGE_SIZE -1), %ebx - andl $LARGE_PAGE_MASK, %ebx + addl $(PMD_PAGE_SIZE -1), %ebx + andl $PMD_PAGE_MASK, %ebx #else movl $CONFIG_PHYSICAL_START, %ebx #endif @@ -220,8 +220,8 @@ ENTRY(startup_64) /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(LARGE_PAGE_SIZE - 1), %rbp - andq $LARGE_PAGE_MASK, %rbp + addq $(PMD_PAGE_SIZE - 1), %rbp + andq $PMD_PAGE_MASK, %rbp movq %rbp, %rbx #else movq $CONFIG_PHYSICAL_START, %rbp diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 2a5c32da585..00e19edd852 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -1,7 +1,7 @@ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2007-2008 rPath, Inc. - All Rights Reserved * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -9,7 +9,7 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/cpu.c + * arch/x86/boot/cpu.c * * Check for obligatory CPU features and abort if the features are not * present. @@ -19,6 +19,8 @@ #include "bitops.h" #include <asm/cpufeature.h> +#include "cpustr.h" + static char *cpu_name(int level) { static char buf[6]; @@ -35,6 +37,7 @@ int validate_cpu(void) { u32 *err_flags; int cpu_level, req_level; + const unsigned char *msg_strs; check_cpu(&cpu_level, &req_level, &err_flags); @@ -51,13 +54,26 @@ int validate_cpu(void) puts("This kernel requires the following features " "not present on the CPU:\n"); + msg_strs = (const unsigned char *)x86_cap_strs; + for (i = 0; i < NCAPINTS; i++) { u32 e = err_flags[i]; for (j = 0; j < 32; j++) { - if (e & 1) - printf("%d:%d ", i, j); - + int n = (i << 5)+j; + if (*msg_strs < n) { + /* Skip to the next string */ + do { + msg_strs++; + } while (*msg_strs); + msg_strs++; + } + if (e & 1) { + if (*msg_strs == n && msg_strs[1]) + printf("%s ", msg_strs+1); + else + printf("%d:%d ", i, j); + } e >>= 1; } } diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c new file mode 100644 index 00000000000..bbe76953bae --- /dev/null +++ b/arch/x86/boot/mkcpustr.c @@ -0,0 +1,49 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2008 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * This is a host program to preprocess the CPU strings into a + * compact format suitable for the setup code. + */ + +#include <stdio.h> + +#include "../kernel/cpu/feature_names.c" + +#if NCAPFLAGS > 8 +# error "Need to adjust the boot code handling of CPUID strings" +#endif + +int main(void) +{ + int i; + const char *str; + + printf("static const char x86_cap_strs[] = \n"); + + for (i = 0; i < NCAPINTS*32; i++) { + str = x86_cap_flags[i]; + + if (i == NCAPINTS*32-1) { + /* The last entry must be unconditional; this + also consumes the compiler-added null character */ + if (!str) + str = ""; + printf("\t\"\\x%02x\"\"%s\"\n", i, str); + } else if (str) { + printf("#if REQUIRED_MASK%d & (1 << %d)\n" + "\t\"\\x%02x\"\"%s\\0\"\n" + "#endif\n", + i >> 5, i & 31, i, str); + } + } + printf("\t;\n"); + return 0; +} diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6f813009d44..21dc1a061bf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -37,7 +37,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_PCI) += early-quirks.o -obj-$(CONFIG_APM) += apm_32.o +apm-y := apm_32.o +obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o @@ -74,7 +75,8 @@ ifdef CONFIG_INPUT_PCSPKR obj-y += pcspeaker.o endif -obj-$(CONFIG_SCx200) += scx200_32.o +obj-$(CONFIG_SCx200) += scx200.o +scx200-y += scx200_32.o ### # 64 bit specific files diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfdb2f3bd76..a0c4d7c5dbd 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,6 +3,7 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y += feature_names.o obj-$(CONFIG_X86_32) += common.o proc.o bugs.o obj-$(CONFIG_X86_32) += amd.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b7b2142b58e..d9313d9adce 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -623,16 +623,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; * They will insert themselves into the cpu_devs structure. * Then, when cpu_init() is called, we can just iterate over that array. */ - -extern int intel_cpu_init(void); -extern int cyrix_init_cpu(void); -extern int nsc_init_cpu(void); -extern int amd_init_cpu(void); -extern int centaur_init_cpu(void); -extern int transmeta_init_cpu(void); -extern int nexgen_init_cpu(void); -extern int umc_init_cpu(void); - void __init early_cpu_init(void) { intel_cpu_init(); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ad6527a5beb..e0b38c33d84 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -27,3 +27,12 @@ extern void display_cacheinfo(struct cpuinfo_x86 *c); extern void early_init_intel(struct cpuinfo_x86 *c); extern void early_init_amd(struct cpuinfo_x86 *c); +/* Specific CPU type init functions */ +int intel_cpu_init(void); +int amd_init_cpu(void); +int cyrix_init_cpu(void); +int nsc_init_cpu(void); +int centaur_init_cpu(void); +int transmeta_init_cpu(void); +int nexgen_init_cpu(void); +int umc_init_cpu(void); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c new file mode 100644 index 00000000000..ee975ac6bbc --- /dev/null +++ b/arch/x86/kernel/cpu/feature_names.c @@ -0,0 +1,83 @@ +/* + * Strings for the various x86 capability flags. + * + * This file must not contain any executable code. + */ + +#include "asm/cpufeature.h" + +/* + * These flag bits must match the definitions in <asm/cpufeature.h>. + * NULL means this bit is undefined or reserved; either way it doesn't + * have meaning as far as Linux is concerned. Note that it's important + * to realize there is a difference between this table and CPUID -- if + * applications want to get the raw CPUID data, they should access + * /dev/cpu/<cpu_nr>/cpuid instead. + */ +const char * const x86_cap_flags[NCAPINTS*32] = { + /* Intel-defined */ + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + + /* AMD-defined */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", + + /* Transmeta-defined */ + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Other (Linux-defined) */ + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", + NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, "arch_perfmon", + "pebs", "bts", NULL, NULL, + "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Intel-defined (#2) */ + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* VIA/Cyrix/Centaur-defined */ + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", + "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* AMD-defined (#2) */ + "lahf_lm", "cmp_legacy", "svm", "extapic", + "cr8_legacy", "abm", "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", "sse5", + "skinit", "wdt", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Auxiliary (Linux-defined) */ + "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + "100mhzsteps", + "hwpstate", + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ +}; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d1c372b018d..fae31ce747b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/ds.h> +#include <asm/bugs.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 8e139c70f88..ff14c320040 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -7,8 +7,6 @@ #include <asm/processor-flags.h> #include "mtrr.h" -int arr3_protected; - static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) @@ -99,8 +97,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) case 4: return replace_reg; case 3: - if (arr3_protected) - break; case 2: case 1: case 0: @@ -115,8 +111,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); - if ((i == 3) && arr3_protected) - continue; if (lsize == 0) return i; } @@ -260,107 +254,6 @@ static void cyrix_set_all(void) post_set(); } -#if 0 -/* - * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection - * with the SMM (System Management Mode) mode. So we need the following: - * Check whether SMI_LOCK (CCR3 bit 0) is set - * if it is set, write a warning message: ARR3 cannot be changed! - * (it cannot be changed until the next processor reset) - * if it is reset, then we can change it, set all the needed bits: - * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset) - * - disable access to SMM memory (CCR1 bit 2 reset) - * - disable SMM mode (CCR1 bit 1 reset) - * - disable write protection of ARR3 (CCR6 bit 1 reset) - * - (maybe) disable ARR3 - * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) - */ -static void __init -cyrix_arr_init(void) -{ - struct set_mtrr_context ctxt; - unsigned char ccr[7]; - int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 }; -#ifdef CONFIG_SMP - int i; -#endif - - /* flush cache and enable MAPEN */ - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - /* Save all CCRs locally */ - ccr[0] = getCx86(CX86_CCR0); - ccr[1] = getCx86(CX86_CCR1); - ccr[2] = getCx86(CX86_CCR2); - ccr[3] = ctxt.ccr3; - ccr[4] = getCx86(CX86_CCR4); - ccr[5] = getCx86(CX86_CCR5); - ccr[6] = getCx86(CX86_CCR6); - - if (ccr[3] & 1) { - ccrc[3] = 1; - arr3_protected = 1; - } else { - /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and - * access to SMM memory through ARR3 (bit 7). - */ - if (ccr[1] & 0x80) { - ccr[1] &= 0x7f; - ccrc[1] |= 0x80; - } - if (ccr[1] & 0x04) { - ccr[1] &= 0xfb; - ccrc[1] |= 0x04; - } - if (ccr[1] & 0x02) { - ccr[1] &= 0xfd; - ccrc[1] |= 0x02; - } - arr3_protected = 0; - if (ccr[6] & 0x02) { - ccr[6] &= 0xfd; - ccrc[6] = 1; /* Disable write protection of ARR3 */ - setCx86(CX86_CCR6, ccr[6]); - } - /* Disable ARR3. This is safe now that we disabled SMM. */ - /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ - } - /* If we changed CCR1 in memory, change it in the processor, too. */ - if (ccrc[1]) - setCx86(CX86_CCR1, ccr[1]); - - /* Enable ARR usage by the processor */ - if (!(ccr[5] & 0x20)) { - ccr[5] |= 0x20; - ccrc[5] = 1; - setCx86(CX86_CCR5, ccr[5]); - } -#ifdef CONFIG_SMP - for (i = 0; i < 7; i++) - ccr_state[i] = ccr[i]; - for (i = 0; i < 8; i++) - cyrix_get_arr(i, - &arr_state[i].base, &arr_state[i].size, - &arr_state[i].type); -#endif - - set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */ - - if (ccrc[5]) - printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n"); - if (ccrc[3]) - printk(KERN_INFO "mtrr: ARR3 cannot be changed\n"); -/* - if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n"); - if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n"); - if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n"); -*/ - if (ccrc[6]) - printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n"); -} -#endif - static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, // .init = cyrix_arr_init, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 71591958265..1e27b69a7a0 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,12 +59,6 @@ struct mtrr_ops * mtrr_if = NULL; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -#ifndef CONFIG_X86_64 -extern int arr3_protected; -#else -#define arr3_protected 0 -#endif - void set_mtrr_ops(struct mtrr_ops * ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) @@ -513,12 +507,6 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) printk(KERN_WARNING "mtrr: register: %d too big\n", reg); goto out; } - if (is_cpu(CYRIX) && !use_intel()) { - if ((reg == 3) && arr3_protected) { - printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n"); - goto out; - } - } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); @@ -566,10 +554,6 @@ EXPORT_SYMBOL(mtrr_del); * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ -extern void amd_init_mtrr(void); -extern void cyrix_init_mtrr(void); -extern void centaur_init_mtrr(void); - static void __init init_ifs(void) { #ifndef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index fb74a2c2081..2cc77eb6fea 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -97,3 +97,7 @@ void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); +/* CPU specific mtrr init functions */ +int amd_init_mtrr(void); +int cyrix_init_mtrr(void); +int centaur_init_mtrr(void); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 02821326014..af11d31dce0 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -10,80 +10,6 @@ */ static int show_cpuinfo(struct seq_file *m, void *v) { - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ - static const char * const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char * const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* constant_tsc - moved to flags */ - /* nothing */ - }; struct cpuinfo_x86 *c = v; int i, n = 0; int fpu_exception; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a63432d800f..288e7a6598a 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,6 +17,10 @@ * and then read in chunks of 16 bytes. A larger size means multiple * reads of consecutive levels. * + * The lower 32 bits of the file position is used as the incoming %eax, + * and the upper 32 bits of the file position as the incoming %ecx, + * the latter intended for "counting" eax levels like eax=4. + * * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on * an SMP box will direct the access to CPU %d. */ @@ -43,35 +47,24 @@ static struct class *cpuid_class; -struct cpuid_command { - u32 reg; - u32 *data; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; }; static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_command *cmd = cmd_block; - - cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], - &cmd->data[3]); -} - -static inline void do_cpuid(int cpu, u32 reg, u32 * data) -{ - struct cpuid_command cmd; - - cmd.reg = reg; - cmd.data = data; + struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + cpuid_count(cmd->eax, cmd->ecx, + &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); } static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) { loff_t ret; + struct inode *inode = file->f_mapping->host; - lock_kernel(); - + mutex_lock(&inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -84,8 +77,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) default: ret = -EINVAL; } - - unlock_kernel(); + mutex_unlock(&inode->i_mutex); return ret; } @@ -93,19 +85,21 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) { char __user *tmp = buf; - u32 data[4]; - u32 reg = *ppos; + struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); + u64 pos = *ppos; if (count % 16) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 16) { - do_cpuid(cpu, reg, data); - if (copy_to_user(tmp, &data, 16)) + cmd.eax = pos; + cmd.ecx = pos >> 32; + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; - *ppos = reg++; + *ppos = ++pos; } return tmp - buf; @@ -193,7 +187,7 @@ static int __init cpuid_init(void) } for_each_online_cpu(i) { err = cpuid_device_create(i); - if (err != 0) + if (err != 0) goto out_class; } register_hotcpu_notifier(&cpuid_class_cpu_notifier); @@ -208,7 +202,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); out: return err; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1411324a625..32dd62b36ff 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -379,11 +379,9 @@ void __init efi_init(void) #endif } -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - unsigned long end; void *p; if (!(__supported_pte_mask & _PAGE_NX)) @@ -392,18 +390,13 @@ static void __init runtime_code_page_mkexec(void) /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - if (md->type == EFI_RUNTIME_SERVICES_CODE && - (end >> PAGE_SHIFT) <= max_pfn_mapped) { - set_memory_x(md->virt_addr, md->num_pages); - set_memory_uc(md->virt_addr, md->num_pages); - } + + if (md->type != EFI_RUNTIME_SERVICES_CODE) + continue; + + set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT); } - __flush_tlb_all(); } -#else -static inline void __init runtime_code_page_mkexec(void) { } -#endif /* * This function will switch the EFI runtime services to virtual mode. @@ -417,30 +410,40 @@ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md; efi_status_t status; - unsigned long end; - void *p; + unsigned long size; + u64 end, systab; + void *p, *va; efi.systab = NULL; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - if ((md->attribute & EFI_MEMORY_WB) && - ((end >> PAGE_SHIFT) <= max_pfn_mapped)) - md->virt_addr = (unsigned long)__va(md->phys_addr); + + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + + if ((end >> PAGE_SHIFT) <= max_pfn_mapped) + va = __va(md->phys_addr); else - md->virt_addr = (unsigned long) - efi_ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!md->virt_addr) + va = efi_ioremap(md->phys_addr, size); + + if (md->attribute & EFI_MEMORY_WB) + set_memory_uc(md->virt_addr, size); + + md->virt_addr = (u64) (unsigned long) va; + + if (!va) { printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", (unsigned long long)md->phys_addr); - if ((md->phys_addr <= (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < end)) - efi.systab = (efi_system_table_t *)(unsigned long) - (md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab); + continue; + } + + systab = (u64) (unsigned long) efi_phys.systab; + if (md->phys_addr <= systab && systab < end) { + systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *) (unsigned long) systab; + } } BUG_ON(!efi.systab); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 674f2379480..09d5c233093 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -54,10 +54,10 @@ static void __init early_mapping_set_exec(unsigned long start, else set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ __supported_pte_mask)); - if (level == 4) - start = (start + PMD_SIZE) & PMD_MASK; - else + if (level == PG_LEVEL_4K) start = (start + PAGE_SIZE) & PAGE_MASK; + else + start = (start + PMD_SIZE) & PMD_MASK; } } @@ -109,23 +109,23 @@ void __init efi_reserve_bootmem(void) memmap.nr_map * memmap.desc_size); } -void __iomem * __init efi_ioremap(unsigned long offset, - unsigned long size) +void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) { static unsigned pages_mapped; - unsigned long last_addr; unsigned i, pages; - last_addr = offset + size - 1; - offset &= PAGE_MASK; - pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT; + /* phys_addr and size must be page aligned */ + if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK)) + return NULL; + + pages = size >> PAGE_SHIFT; if (pages_mapped + pages > MAX_EFI_IO_PAGES) return NULL; for (i = 0; i < pages; i++) { __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - offset, PAGE_KERNEL_EXEC_NOCACHE); - offset += PAGE_SIZE; + phys_addr, PAGE_KERNEL); + phys_addr += PAGE_SIZE; pages_mapped++; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d5a7a36120..4f283ad215e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -63,7 +63,7 @@ startup_64: /* Is the address not 2M aligned? */ movq %rbp, %rax - andl $~LARGE_PAGE_MASK, %eax + andl $~PMD_PAGE_MASK, %eax testl %eax, %eax jnz bad_address @@ -88,7 +88,7 @@ startup_64: /* Add an Identity mapping if I am above 1G */ leaq _text(%rip), %rdi - andq $LARGE_PAGE_MASK, %rdi + andq $PMD_PAGE_MASK, %rdi movq %rdi, %rax shrq $PUD_SHIFT, %rax diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8a7660c8394..0224c3637c7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -35,7 +35,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount + 511) & (~511); + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index bd82850e651..af51ea8400b 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,9 +45,10 @@ static struct class *msr_class; static loff_t msr_seek(struct file *file, loff_t offset, int orig) { - loff_t ret = -EINVAL; + loff_t ret; + struct inode *inode = file->f_mapping->host; - lock_kernel(); + mutex_lock(&inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -56,8 +57,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig) case 1: file->f_pos += offset; ret = file->f_pos; + break; + default: + ret = -EINVAL; } - unlock_kernel(); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 4d5cc718198..845cbecd68e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -501,7 +501,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) } a = aper + iommu_size; - iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { printk(KERN_WARNING @@ -731,7 +731,8 @@ void __init gart_iommu_init(void) * the backing memory. The GART address is only used by PCI * devices. */ - clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); /* * Try to workaround a bug (thanks to BenH) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 968371ab223..dabdbeff1f7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -251,7 +251,7 @@ void cpu_idle_wait(void) * because it has nothing to do. * Give all the remaining CPUS a kick. */ - smp_call_function_mask(map, do_nothing, 0, 0); + smp_call_function_mask(map, do_nothing, NULL, 0); } while (!cpus_empty(map)); set_cpus_allowed(current, tmp); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 18df70c534b..c8939dfddfb 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -1068,82 +1068,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) struct cpuinfo_x86 *c = v; int cpu = 0, i; - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ - static const char *const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char *const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ - }; - - #ifdef CONFIG_SMP cpu = c->cpu_index; #endif diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index ae0ef2e304c..36c100c323a 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/sort.h> #include <asm/uaccess.h> +#include <asm/asm.h> extern int rodata_test_data; @@ -89,16 +90,7 @@ static noinline int test_address(void *address) "2: mov %[zero], %[rslt]\n" " ret\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" -#ifdef CONFIG_X86_32 - " .long 0b\n" - " .long 2b\n" -#else - " .quad 0b\n" - " .quad 2b\n" -#endif - ".previous\n" + _ASM_EXTABLE(0b,2b) : [rslt] "=r" (result) : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) ); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 9bcc1c6aca3..64580679861 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -11,12 +11,7 @@ * trampoline page to make our stack and everything else * is a mystery. * - * In fact we don't actually need a stack so we don't - * set one up. - * - * We jump into the boot/compressed/head.S code. So you'd - * better be running a compressed kernel image or you - * won't get very far. + * We jump into arch/x86/kernel/head_32.S. * * On entry to trampoline_data, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index e30b67c6a9f..4aedd0bcee4 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -10,9 +10,6 @@ * trampoline page to make our stack and everything else * is a mystery. * - * In fact we don't actually need a stack so we don't - * set one up. - * * On entry to trampoline_data, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. Thus, data addresses need to be absolute diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 4525bc2c2e1..12affe1f9bc 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -220,21 +220,21 @@ static void vmi_set_tr(void) static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { u32 *idt_entry = (u32 *)g; - vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]); + vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]); } static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, const void *desc, int type) { u32 *gdt_entry = (u32 *)desc; - vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]); + vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]); } static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]); + vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c83e1c9b512..41962e793c0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -53,5 +53,6 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig +source drivers/virtio/Kconfig endif # VIRTUALIZATION diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c index afd0045595d..b6544045985 100644 --- a/arch/x86/lib/bitops_32.c +++ b/arch/x86/lib/bitops_32.c @@ -2,7 +2,7 @@ #include <linux/module.h> /** - * find_next_bit - find the first set bit in a memory region + * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The maximum size to search diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c index 95b6d9639fb..0e8f491e6cc 100644 --- a/arch/x86/lib/bitops_64.c +++ b/arch/x86/lib/bitops_64.c @@ -58,7 +58,7 @@ long find_first_zero_bit(const unsigned long * addr, unsigned long size) } /** - * find_next_zero_bit - find the first zero bit in a memory region + * find_next_zero_bit - find the next zero bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at * @size: The maximum size to search diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 28084d2e8dd..cc9b4a4450f 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -4,6 +4,7 @@ #include <linux/hardirq.h> #include <linux/module.h> +#include <asm/asm.h> #include <asm/i387.h> @@ -50,10 +51,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); @@ -81,10 +79,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -181,10 +176,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); for(i=0; i<(4096-320)/64; i++) @@ -211,10 +203,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -311,10 +300,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); for(i=0; i<4096/64; i++) @@ -341,10 +327,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9c4ffd5bedb..e849b9998b0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -48,10 +48,7 @@ do { \ "3: movl %5,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 0b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ @@ -132,11 +129,8 @@ do { \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 0b,3b\n" \ - " .long 1b,2b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ } while (0) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 893d43f838c..0c89d1bb028 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -31,10 +31,7 @@ do { \ "3: movq %5,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 0b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ @@ -87,11 +84,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) "3: lea 0(%[size1],%[size8],8),%[size8]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 0b,3b\n" - " .quad 1b,2b\n" - ".previous" + _ASM_EXTABLE(0b,3b) + _ASM_EXTABLE(1b,2b) : [size8] "=c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), [zero] "r" (0UL), [eight] "r" (8UL)); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4440d0abf8..ad8b9733d6b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address) pud = pud_offset(pgd, address); if (bad_address(pud)) goto bad; printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; + if (!pud_present(*pud) || pud_large(*pud)) + goto ret; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; @@ -508,6 +509,10 @@ static int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + /* Copy kernel mappings over when needed. This can also happen within a race in page table update. In the later case just flush. */ @@ -603,6 +608,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ #ifdef CONFIG_X86_32 if (unlikely(address >= TASK_SIZE)) { +#else + if (unlikely(address >= TASK_SIZE64)) { +#endif if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; @@ -618,6 +626,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) goto bad_area_nosemaphore; } + +#ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) @@ -630,28 +640,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (in_atomic() || !mm) goto bad_area_nosemaphore; #else /* CONFIG_X86_64 */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - - /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) - return; - - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } if (likely(regs->flags & X86_EFLAGS_IF)) local_irq_enable(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f2f36f8dae5..d1bc04006d1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -31,6 +31,7 @@ #include <linux/initrd.h> #include <linux/cpumask.h> +#include <asm/asm.h> #include <asm/processor.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void) "1: movb %1, %0 \n" " xorl %2, %2 \n" "2: \n" - ".section __ex_table, \"a\"\n" - " .align 4 \n" - " .long 1b, 2b \n" - ".previous \n" + _ASM_EXTABLE(1b,2b) :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index eabcaed76c2..3a98d6f724a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long entry; pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { @@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) continue; - entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; - entry &= __supported_pte_mask; - set_pmd(pmd, __pmd(entry)); + set_pte((pte_t *)pmd, + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } } @@ -435,49 +433,6 @@ void __init paging_init(void) #endif /* - * Unmap a kernel mapping if it exists. This is useful to avoid - * prefetches from the CPU leading to inconsistent cache lines. - * address and size must be aligned to 2MB boundaries. - * Does nothing when the mapping doesn't exist. - */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - - if (pgd_none(*pgd)) - continue; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - - if (!(pmd_val(*pmd) & _PAGE_PSE)) { - /* - * Could handle this, but it should not happen - * currently: - */ - printk(KERN_ERR "clear_kernel_mapping: " - "mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - -/* * Memory hotplug specific functions */ void online_page(struct page *page) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c004d94608f..ee6648fe6b1 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr) * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. */ -static int ioremap_change_attr(unsigned long paddr, unsigned long size, +static int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum ioremap_mode mode) { - unsigned long vaddr = (unsigned long)__va(paddr); unsigned long nrpages = size >> PAGE_SHIFT; - unsigned int level; int err; - /* No change for pages after the last mapping */ - if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT)) - return 0; - - /* - * If there is no identity map for this address, - * change_page_attr_addr is unnecessary - */ - if (!lookup_address(vaddr, &level)) - return 0; - switch (mode) { case IOR_MODE_UNCACHED: default: @@ -114,9 +101,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size, static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, enum ioremap_mode mode) { - void __iomem *addr; + unsigned long pfn, offset, last_addr, vaddr; struct vm_struct *area; - unsigned long offset, last_addr; pgprot_t prot; /* Don't allow wraparound or zero size */ @@ -133,9 +119,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && - (offset << PAGE_SHIFT) < last_addr; offset++) { - if (page_is_ram(offset)) + for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped && + (pfn << PAGE_SHIFT) < last_addr; pfn++) { + if (page_is_ram(pfn) && pfn_valid(pfn) && + !PageReserved(pfn_to_page(pfn))) return NULL; } @@ -163,19 +150,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, if (!area) return NULL; area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, prot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + vaddr = (unsigned long) area->addr; + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + remove_vm_area((void *)(vaddr & PAGE_MASK)); return NULL; } - if (ioremap_change_attr(phys_addr, size, mode) < 0) { - vunmap(addr); + if (ioremap_change_attr(vaddr, size, mode) < 0) { + vunmap(area->addr); return NULL; } - return (void __iomem *) (offset + (char __iomem *)addr); + return (void __iomem *) (vaddr + offset); } /** @@ -254,9 +240,6 @@ void iounmap(volatile void __iomem *addr) return; } - /* Reset the direct mapping. Can block */ - ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED); - /* Finally remove it */ o = remove_vm_area((void *)addr); BUG_ON(p != o || o == NULL); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a920d09b919..5a02bf4c91e 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, if (node_data[nodeid] == NULL) return; nodedata_phys = __pa(node_data[nodeid]); + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, + nodedata_phys + pgdat_size - 1); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; @@ -225,12 +227,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, start_pfn, end_pfn); + printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", + bootmap_start, bootmap_start + bootmap_size - 1, + bootmap_pages); + free_bootmem_with_active_regions(nodeid, end); reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 7573e786d2f..398f3a578dd 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -137,7 +137,8 @@ static __init int exercise_pageattr(void) for (k = 0; k < len[i]; k++) { pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); - if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 || + !(pte_val(*pte) & _PAGE_PRESENT)) { addr[i] = 0; break; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e297bd65e51..bb55a78dcd6 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -16,6 +16,17 @@ #include <asm/uaccess.h> #include <asm/pgalloc.h> +/* + * The current flushing context - we pass it instead of 5 arguments: + */ +struct cpa_data { + unsigned long vaddr; + pgprot_t mask_set; + pgprot_t mask_clr; + int numpages; + int flushtlb; +}; + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -52,21 +63,23 @@ void clflush_cache_range(void *vaddr, unsigned int size) static void __cpa_flush_all(void *arg) { + unsigned long cache = (unsigned long)arg; + /* * Flush all to work around Errata in early athlons regarding * large page flushing. */ __flush_tlb_all(); - if (boot_cpu_data.x86_model >= 4) + if (cache && boot_cpu_data.x86_model >= 4) wbinvd(); } -static void cpa_flush_all(void) +static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, NULL, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); } static void __cpa_flush_range(void *arg) @@ -79,7 +92,7 @@ static void __cpa_flush_range(void *arg) __flush_tlb_all(); } -static void cpa_flush_range(unsigned long start, int numpages) +static void cpa_flush_range(unsigned long start, int numpages, int cache) { unsigned int i, level; unsigned long addr; @@ -89,6 +102,9 @@ static void cpa_flush_range(unsigned long start, int numpages) on_each_cpu(__cpa_flush_range, NULL, 1, 1); + if (!cache) + return; + /* * We only need to flush on one CPU, * clflush is a MESI-coherent instruction that @@ -101,11 +117,27 @@ static void cpa_flush_range(unsigned long start, int numpages) /* * Only flush present addresses: */ - if (pte && pte_present(*pte)) + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range((void *) addr, PAGE_SIZE); } } +#define HIGH_MAP_START __START_KERNEL_map +#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) + + +/* + * Converts a virtual address to a X86-64 highmap address + */ +static unsigned long virt_to_highmap(void *address) +{ +#ifdef CONFIG_X86_64 + return __pa((unsigned long)address) + HIGH_MAP_START - phys_base; +#else + return (unsigned long)address; +#endif +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@ -129,12 +161,24 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) */ if (within(address, (unsigned long)_text, (unsigned long)_etext)) pgprot_val(forbidden) |= _PAGE_NX; + /* + * Do the same for the x86-64 high kernel mapping + */ + if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) + pgprot_val(forbidden) |= _PAGE_NX; + #ifdef CONFIG_DEBUG_RODATA /* The .rodata section needs to be read-only */ if (within(address, (unsigned long)__start_rodata, (unsigned long)__end_rodata)) pgprot_val(forbidden) |= _PAGE_RW; + /* + * Do the same for the x86-64 high kernel mapping + */ + if (within(address, virt_to_highmap(__start_rodata), + virt_to_highmap(__end_rodata))) + pgprot_val(forbidden) |= _PAGE_RW; #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); @@ -142,6 +186,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) return prot; } +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. Otherwise we would return a + * pointer to a nonexisting mapping. + */ pte_t *lookup_address(unsigned long address, int *level) { pgd_t *pgd = pgd_offset_k(address); @@ -152,21 +204,31 @@ pte_t *lookup_address(unsigned long address, int *level) if (pgd_none(*pgd)) return NULL; + pud = pud_offset(pgd, address); if (pud_none(*pud)) return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; *level = PG_LEVEL_2M; - if (pmd_large(*pmd)) + if (pmd_large(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; + return pte_offset_kernel(pmd, address); } +/* + * Set the new pmd in all the pgds we know about: + */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ @@ -175,6 +237,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) if (!SHARED_KERNEL_PMD) { struct page *page; + address = __pa(address); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pud_t *pud; @@ -189,18 +252,114 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) #endif } +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, flags; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot; + int level, do_split = 1; + + /* + * An Athlon 64 X2 showed hard hangs if we tried to preserve + * largepages and changed the PSE entry from RW to RO. + * + * As AMD CPUs have a long series of erratas in this area, + * (and none of the known ones seem to explain this hang), + * disable this code until the hang can be debugged: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return 1; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#ifdef CONFIG_X86_64 + case PG_LEVEL_1G: + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; + break; +#endif + default: + do_split = -EINVAL; + goto out_unlock; + } + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + */ + old_pte = *kpte; + old_prot = new_prot = pte_pgprot(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + new_prot = static_protections(new_prot, address); + + /* + * If there are no changes, return. maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + do_split = 0; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + __set_pmd_pte(kpte, address, new_pte); + cpa->flushtlb = 1; + do_split = 0; + } + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + + return do_split; +} + static int split_large_page(pte_t *kpte, unsigned long address) { - pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + unsigned long flags, pfn, pfninc = 1; gfp_t gfp_flags = GFP_KERNEL; - unsigned long flags; - unsigned long addr; + unsigned int i, level; pte_t *pbase, *tmp; + pgprot_t ref_prot; struct page *base; - unsigned int i, level; #ifdef CONFIG_DEBUG_PAGEALLOC - gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN; gfp_flags = GFP_ATOMIC | __GFP_NOWARN; #endif base = alloc_pages(gfp_flags, 0); @@ -213,30 +372,41 @@ static int split_large_page(pte_t *kpte, unsigned long address) * up for us already: */ tmp = lookup_address(address, &level); - if (tmp != kpte) { - WARN_ON_ONCE(1); + if (tmp != kpte) goto out_unlock; - } - address = __pa(address); - addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); #ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); #endif + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + +#ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { + pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pgprot_val(ref_prot) |= _PAGE_PSE; + } +#endif - pgprot_val(ref_prot) &= ~_PAGE_NX; - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) - set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); + /* + * Get the target pfn from the original entry: + */ + pfn = pte_pfn(*kpte); + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) + set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); /* - * Install the new, split up pagetable. Important detail here: + * Install the new, split up pagetable. Important details here: * * On Intel the NX bit of all levels must be cleared to make a * page executable. See section 4.13.2 of Intel 64 and IA-32 * Architectures Software Developer's Manual). + * + * Mark the entry present. The current mapping might be + * set to not present, which we preserved above. */ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + pgprot_val(ref_prot) |= _PAGE_PRESENT; __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); base = NULL; @@ -249,18 +419,12 @@ out_unlock: return 0; } -static int -__change_page_attr(unsigned long address, unsigned long pfn, - pgprot_t mask_set, pgprot_t mask_clr) +static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { + int level, do_split, err; struct page *kpte_page; - int level, err = 0; pte_t *kpte; -#ifdef CONFIG_X86_32 - BUG_ON(pfn > max_low_pfn); -#endif - repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -271,23 +435,62 @@ repeat: BUG_ON(PageCompound(kpte_page)); if (level == PG_LEVEL_4K) { - pgprot_t new_prot = pte_pgprot(*kpte); pte_t new_pte, old_pte = *kpte; + pgprot_t new_prot = pte_pgprot(old_pte); + + if(!pte_val(old_pte)) { + printk(KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", address, + cpa->vaddr); + WARN_ON(1); + return -EINVAL; + } - pgprot_val(new_prot) &= ~pgprot_val(mask_clr); - pgprot_val(new_prot) |= pgprot_val(mask_set); + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); new_prot = static_protections(new_prot, address); - new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); - BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); + /* + * We need to keep the pfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + + /* + * Do we really change anything ? + */ + if (pte_val(old_pte) != pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flushtlb = 1; + } + cpa->numpages = 1; + return 0; + } + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + do_split = try_preserve_large_page(kpte, address, cpa); + /* + * When the range fits into the existing large page, + * return. cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (do_split <= 0) + return do_split; - set_pte_atomic(kpte, new_pte); - } else { - err = split_large_page(kpte, address); - if (!err) - goto repeat; + /* + * We have to split the large page: + */ + err = split_large_page(kpte, address); + if (!err) { + cpa->flushtlb = 1; + goto repeat; } + return err; } @@ -304,19 +507,14 @@ repeat: * * Modules and drivers should use the set_memory_* APIs instead. */ - -#define HIGH_MAP_START __START_KERNEL_map -#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) - -static int -change_page_attr_addr(unsigned long address, pgprot_t mask_set, - pgprot_t mask_clr) +static int change_page_attr_addr(struct cpa_data *cpa) { - unsigned long phys_addr = __pa(address); - unsigned long pfn = phys_addr >> PAGE_SHIFT; int err; + unsigned long address = cpa->vaddr; #ifdef CONFIG_X86_64 + unsigned long phys_addr = __pa(address); + /* * If we are inside the high mapped kernel range, then we * fixup the low mapping first. __va() returns the virtual @@ -326,7 +524,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, address = (unsigned long) __va(phys_addr); #endif - err = __change_page_attr(address, pfn, mask_set, mask_clr); + err = __change_page_attr(address, cpa); if (err) return err; @@ -339,42 +537,89 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, /* * Calc the high mapping address. See __phys_addr() * for the non obvious details. + * + * Note that NX and other required permissions are + * checked in static_protections(). */ address = phys_addr + HIGH_MAP_START - phys_base; - /* Make sure the kernel mappings stay executable */ - pgprot_val(mask_clr) |= _PAGE_NX; /* * Our high aliases are imprecise, because we check * everything between 0 and KERNEL_TEXT_SIZE, so do * not propagate lookup failures back to users: */ - __change_page_attr(address, pfn, mask_set, mask_clr); + __change_page_attr(address, cpa); } #endif return err; } -static int __change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) +static int __change_page_attr_set_clr(struct cpa_data *cpa) { - unsigned int i; - int ret; + int ret, numpages = cpa->numpages; - for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { - ret = change_page_attr_addr(addr, mask_set, mask_clr); + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; + ret = change_page_attr_addr(cpa); if (ret) return ret; - } + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } +static inline int cache_attr(pgprot_t attr) +{ + return pgprot_val(attr) & + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); +} + static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr) { - int ret = __change_page_attr_set_clr(addr, numpages, mask_set, - mask_clr); + struct cpa_data cpa; + int ret, cache; + + /* + * Check, if we are requested to change a not supported + * feature: + */ + mask_set = canon_pgprot(mask_set); + mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + return 0; + + cpa.vaddr = addr; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + cpa.flushtlb = 0; + + ret = __change_page_attr_set_clr(&cpa); + + /* + * Check whether we really changed something: + */ + if (!cpa.flushtlb) + return ret; + + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = cache_attr(mask_set); /* * On success we use clflush, when the CPU supports it to @@ -383,9 +628,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * wbindv): */ if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages); + cpa_flush_range(addr, numpages, cache); else - cpa_flush_all(); + cpa_flush_all(cache); return ret; } @@ -489,37 +734,26 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } - -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG) -static inline int __change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) -{ - return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); -} - -static inline int __change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) -{ - return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); -} -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC static int __set_pages_p(struct page *page, int numpages) { - unsigned long addr = (unsigned long)page_address(page); + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0)}; - return __change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PRESENT | _PAGE_RW)); + return __change_page_attr_set_clr(&cpa); } static int __set_pages_np(struct page *page, int numpages) { - unsigned long addr = (unsigned long)page_address(page); + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; - return __change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PRESENT)); + return __change_page_attr_set_clr(&cpa); } void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cb3aa470249..c7db504be1e 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&page->lru); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -#if (PTRS_PER_PMD == 1) -/* Non-PAE pgd constructor */ -static void pgd_ctor(void *pgd) +static void pgd_ctor(void *p) { + pgd_t *pgd = p; unsigned long flags; - /* !PAE, no pagetable sharing */ + /* Clear usermode parts of PGD */ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); - /* must happen under lock */ - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} -#else /* PTRS_PER_PMD > 1 */ -/* PAE pgd constructor */ -static void pgd_ctor(void *pgd) -{ - /* PAE, kernel PMD may be shared */ - - if (SHARED_KERNEL_PMD) { - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + clone_pgd_range(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - } else { - unsigned long flags; + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - spin_lock_irqsave(&pgd_lock, flags); + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - } + + spin_unlock_irqrestore(&pgd_lock, flags); } -#endif /* PTRS_PER_PMD */ static void pgd_dtor(void *pgd) { @@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd) spin_unlock_irqrestore(&pgd_lock, flags); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - #ifdef CONFIG_X86_PAE /* * Mop up any pmd pages which may still be attached to the pgd. @@ -387,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - /* This is called just after the pmd has been detached from - the pgd, which requires a full tlb flush to be recognized - by the CPU. Rather than incurring multiple tlb flushes - while the address space is being pulled down, make the tlb - gathering machinery do a full flush when we're done. */ - tlb->fullmm = 1; - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c index f5f165f69e0..55270c26237 100644 --- a/arch/x86/pci/numa.c +++ b/arch/x86/pci/numa.c @@ -5,36 +5,62 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/nodemask.h> +#include <mach_apic.h> #include "pci.h" +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ + #define BUS2QUAD(global) (mp_bus_id_to_node[global]) #define BUS2LOCAL(global) (mp_bus_id_to_local[global]) #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) +extern void *xquad_portio; /* Where the IO area was mapped */ +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) +static void write_cf8(unsigned bus, unsigned devfn, unsigned reg) +{ + unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg); + if (xquad_portio) + writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus))); + else + outl(val, 0xCF8); +} + static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; + void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); - outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus)); + write_cf8(bus, devfn, reg); switch (len) { case 1: - *value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus)); + if (xquad_portio) + *value = readb(adr + (reg & 3)); + else + *value = inb(0xCFC + (reg & 3)); break; case 2: - *value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus)); + if (xquad_portio) + *value = readw(adr + (reg & 2)); + else + *value = inw(0xCFC + (reg & 2)); break; case 4: - *value = inl_quad(0xCFC, BUS2QUAD(bus)); + if (xquad_portio) + *value = readl(adr); + else + *value = inl(0xCFC); break; } @@ -47,23 +73,33 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; + void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); - outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus)); + write_cf8(bus, devfn, reg); switch (len) { case 1: - outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus)); + if (xquad_portio) + writeb(value, adr + (reg & 3)); + else + outb((u8)value, 0xCFC + (reg & 3)); break; case 2: - outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus)); + if (xquad_portio) + writew(value, adr + (reg & 2)); + else + outw((u16)value, 0xCFC + (reg & 2)); break; case 4: - outl_quad((u32)value, 0xCFC, BUS2QUAD(bus)); + if (xquad_portio) + writel(value, adr + reg); + else + outl((u32)value, 0xCFC); break; } |