diff options
author | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-02-14 09:46:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-02-14 09:46:06 -0800 |
commit | 414f827c46973ba39320cfb43feb55a0eeb9b4e8 (patch) | |
tree | 45e860974ef698e71370a0ebdddcff4f14fbdf9e /arch | |
parent | 86a71dbd3e81e8870d0f0e56b87875f57e58222b (diff) | |
parent | 126b1922367fbe5513daa675a2abd13ed3917f4e (diff) |
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (94 commits)
[PATCH] x86-64: Remove mk_pte_phys()
[PATCH] i386: Fix broken CONFIG_COMPAT_VDSO on i386
[PATCH] i386: fix 32-bit ioctls on x64_32
[PATCH] x86: Unify pcspeaker platform device code between i386/x86-64
[PATCH] i386: Remove extern declaration from mm/discontig.c, put in header.
[PATCH] i386: Rename cpu_gdt_descr and remove extern declaration from smpboot.c
[PATCH] i386: Move mce_disabled to asm/mce.h
[PATCH] i386: paravirt unhandled fallthrough
[PATCH] x86_64: Wire up compat epoll_pwait
[PATCH] x86: Don't require the vDSO for handling a.out signals
[PATCH] i386: Fix Cyrix MediaGX detection
[PATCH] i386: Fix warning in cpu initialization
[PATCH] i386: Fix warning in microcode.c
[PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs
[PATCH] x86: Add new CPUID bits for AMD Family 10 CPUs in /proc/cpuinfo
[PATCH] i386: Remove fastcall in paravirt.[ch]
[PATCH] x86-64: Fix wrong gcc check in bitops.h
[PATCH] x86-64: survive having no irq mapping for a vector
[PATCH] i386: geode configuration fixes
[PATCH] i386: add option to show more code in oops reports
...
Diffstat (limited to 'arch')
86 files changed, 3183 insertions, 838 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 63d5e841caf..595fb771366 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -203,6 +203,15 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +config VMI + bool "VMI Paravirt-ops support" + depends on PARAVIRT + default y + help + VMI provides a paravirtualized interface to multiple hypervisors + include VMware ESX server and Xen by connecting to a ROM module + provided by the hypervisor. + config ACPI_SRAT bool default y @@ -1263,3 +1272,12 @@ config X86_TRAMPOLINE config KTIME_SCALAR bool default y + +config NO_IDLE_HZ + bool + depends on PARAVIRT + default y + help + Switches the regular HZ timer off when the system is going idle. + This helps a hypervisor detect that the Linux system is idle, + reducing the overhead of idle systems. diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index 2aecfba4ac4..b99c0e2a4e6 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -226,11 +226,6 @@ config X86_CMPXCHG depends on !M386 default y -config X86_XADD - bool - depends on !M386 - default y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || X86_GENERIC diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index f68cc6f215f..458bc161193 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -87,7 +87,7 @@ config DOUBLEFAULT config DEBUG_PARAVIRT bool "Enable some paravirtualization debugging" - default y + default n depends on PARAVIRT && DEBUG_KERNEL help Currently deliberately clobbers regs which are allowed to be diff --git a/arch/i386/defconfig b/arch/i386/defconfig index bb0c376b62b..5ae1e0bc8fd 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-rc3 -# Fri Jan 5 11:54:46 2007 +# Linux kernel version: 2.6.20-git8 +# Tue Feb 13 11:25:18 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -10,6 +10,7 @@ CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_X86=y CONFIG_MMU=y +CONFIG_ZONE_DMA=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_GENERIC_BUG=y @@ -139,7 +140,6 @@ CONFIG_MPENTIUMIII=y # CONFIG_MVIAC3_2 is not set CONFIG_X86_GENERIC=y CONFIG_X86_CMPXCHG=y -CONFIG_X86_XADD=y CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_RWSEM_XCHGADD_ALGORITHM=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set @@ -198,6 +198,7 @@ CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y +CONFIG_ZONE_DMA_FLAG=1 # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y @@ -211,6 +212,7 @@ CONFIG_HZ_250=y CONFIG_HZ=250 # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set +CONFIG_PHYSICAL_START=0x100000 # CONFIG_RELOCATABLE is not set CONFIG_PHYSICAL_ALIGN=0x100000 # CONFIG_HOTPLUG_CPU is not set @@ -229,13 +231,14 @@ CONFIG_PM_SYSFS_DEPRECATED=y # ACPI (Advanced Configuration and Power Interface) Support # CONFIG_ACPI=y +CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_ASUS is not set @@ -306,7 +309,6 @@ CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y # CONFIG_PCIEPORTBUS is not set CONFIG_PCI_MSI=y -# CONFIG_PCI_MULTITHREAD_PROBE is not set # CONFIG_PCI_DEBUG is not set # CONFIG_HT_IRQ is not set CONFIG_ISA_DMA_API=y @@ -347,6 +349,7 @@ CONFIG_UNIX=y CONFIG_XFRM=y # CONFIG_XFRM_USER is not set # CONFIG_XFRM_SUB_POLICY is not set +# CONFIG_XFRM_MIGRATE is not set # CONFIG_NET_KEY is not set CONFIG_INET=y CONFIG_IP_MULTICAST=y @@ -446,6 +449,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set # @@ -466,8 +470,7 @@ CONFIG_FW_LOADER=y # # Plug and Play support # -CONFIG_PNP=y -CONFIG_PNPACPI=y +# CONFIG_PNP is not set # # Block devices @@ -515,6 +518,7 @@ CONFIG_BLK_DEV_IDECD=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_TASK_IOCTL is not set # @@ -547,6 +551,7 @@ CONFIG_BLK_DEV_AMD74XX=y # CONFIG_BLK_DEV_JMICRON is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT8213 is not set # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set @@ -557,6 +562,7 @@ CONFIG_BLK_DEV_PIIX=y # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_BLK_DEV_TC86C001 is not set # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -655,6 +661,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # Serial ATA (prod) and Parallel ATA (experimental) drivers # CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set CONFIG_SATA_AHCI=y CONFIG_SATA_SVW=y CONFIG_ATA_PIIX=y @@ -670,6 +677,7 @@ CONFIG_SATA_SIL=y # CONFIG_SATA_ULI is not set CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set @@ -687,6 +695,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_HPT3X2N is not set # CONFIG_PATA_HPT3X3 is not set # CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set @@ -739,9 +748,7 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_OUI_DB is not set # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set -# CONFIG_IEEE1394_EXPORT_FULL_API is not set # # Device Drivers @@ -767,6 +774,11 @@ CONFIG_IEEE1394_RAWIO=y # CONFIG_I2O is not set # +# Macintosh device drivers +# +# CONFIG_MAC_EMUMOUSEBTN is not set + +# # Network device support # CONFIG_NETDEVICES=y @@ -833,6 +845,7 @@ CONFIG_8139TOO=y # CONFIG_SUNDANCE is not set # CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set +# CONFIG_SC92031 is not set # # Ethernet (1000 Mbit) @@ -855,11 +868,13 @@ CONFIG_SKY2=y CONFIG_TIGON3=y CONFIG_BNX2=y # CONFIG_QLA3XXX is not set +# CONFIG_ATL1 is not set # # Ethernet (10000 Mbit) # # CONFIG_CHELSIO_T1 is not set +# CONFIG_CHELSIO_T3 is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set @@ -1090,6 +1105,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y @@ -1103,6 +1119,7 @@ CONFIG_SOUND_ICH=y # HID Devices # CONFIG_HID=y +# CONFIG_HID_DEBUG is not set # # USB support @@ -1117,10 +1134,8 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y -# CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set -# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1130,9 +1145,11 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set +# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y -# CONFIG_USB_OHCI_BIG_ENDIAN is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set @@ -1183,6 +1200,7 @@ CONFIG_USB_HID=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set +# CONFIG_USB_GTCO is not set # # USB Imaging devices @@ -1288,6 +1306,10 @@ CONFIG_USB_MON=y # # +# Auxiliary Display support +# + +# # Virtualization # # CONFIG_KVM is not set @@ -1480,6 +1502,7 @@ CONFIG_UNUSED_SYMBOLS=y # CONFIG_DEBUG_FS is not set # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set @@ -1488,7 +1511,6 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set @@ -1533,7 +1555,8 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y -CONFIG_IOMAP_COPY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 1e8988e558c..cbe4e601885 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,8 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -# Make sure this is linked after any other paravirt_ops structs: see head.S +obj-$(CONFIG_VMI) += vmi.o vmitime.o obj-$(CONFIG_PARAVIRT) += paravirt.o +obj-y += pcspeaker.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 776d9be26af..f4159e0a7ae 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -36,6 +36,7 @@ #include <asm/hpet.h> #include <asm/i8253.h> #include <asm/nmi.h> +#include <asm/idle.h> #include <mach_apic.h> #include <mach_apicdef.h> @@ -1255,6 +1256,7 @@ fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ + exit_idle(); irq_enter(); smp_local_timer_interrupt(); irq_exit(); @@ -1305,6 +1307,7 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs) { unsigned long v; + exit_idle(); irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1329,6 +1332,7 @@ fastcall void smp_error_interrupt(struct pt_regs *regs) { unsigned long v, v1; + exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1395,7 +1399,7 @@ int __init APIC_init_uniprocessor (void) if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); #endif - setup_boot_APIC_clock(); + setup_boot_clock(); return 0; } diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index db99a8948da..f9ba0af7ee1 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -211,6 +211,7 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/miscdevice.h> #include <linux/apm_bios.h> #include <linux/init.h> @@ -1636,9 +1637,8 @@ static int do_open(struct inode * inode, struct file * filp) return 0; } -static int apm_get_info(char *buf, char **start, off_t fpos, int length) +static int proc_apm_show(struct seq_file *m, void *v) { - char * p; unsigned short bx; unsigned short cx; unsigned short dx; @@ -1650,8 +1650,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) int time_units = -1; char *units = "?"; - p = buf; - if ((num_online_cpus() == 1) && !(error = apm_get_power_status(&bx, &cx, &dx))) { ac_line_status = (bx >> 8) & 0xff; @@ -1705,7 +1703,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) -1: Unknown 8) min = minutes; sec = seconds */ - p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", driver_version, (apm_info.bios.version >> 8) & 0xff, apm_info.bios.version & 0xff, @@ -1716,10 +1714,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length) percentage, time_units, units); + return 0; +} - return p - buf; +static int proc_apm_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_apm_show, NULL); } +static const struct file_operations apm_file_ops = { + .owner = THIS_MODULE, + .open = proc_apm_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int apm(void *unused) { unsigned short bx; @@ -2341,9 +2351,9 @@ static int __init apm_init(void) set_base(gdt[APM_DS >> 3], __va((unsigned long)apm_info.bios.dseg << 4)); - apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); + apm_proc = create_proc_entry("apm", 0, NULL); if (apm_proc) - apm_proc->owner = THIS_MODULE; + apm_proc->proc_fops = &apm_file_ops; kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 1b2f3cd3327..c37535163bf 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -72,7 +72,7 @@ void foo(void) OFFSET(PT_EAX, pt_regs, eax); OFFSET(PT_DS, pt_regs, xds); OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_GS, pt_regs, xgs); + OFFSET(PT_FS, pt_regs, xfs); OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); OFFSET(PT_EIP, pt_regs, eip); OFFSET(PT_CS, pt_regs, xcs); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 8a8bbdaaf38..dcbbd0a8bfc 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -605,7 +605,7 @@ void __init early_cpu_init(void) struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xgs = __KERNEL_PDA; + regs->xfs = __KERNEL_PDA; return regs; } @@ -662,12 +662,12 @@ struct i386_pda boot_pda = { .pcurrent = &init_task, }; -static inline void set_kernel_gs(void) +static inline void set_kernel_fs(void) { - /* Set %gs for this CPU's PDA. Memory clobber is to create a + /* Set %fs for this CPU's PDA. Memory clobber is to create a barrier with respect to any PDA operations, so the compiler doesn't move any before here. */ - asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); + asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); } /* Initialize the CPU's GDT and PDA. The boot CPU does this for @@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu) the boot CPU, this will transition from the boot gdt+pda to the real ones). */ load_gdt(cpu_gdt_descr); - set_kernel_gs(); + set_kernel_fs(); } /* Common CPU init for both boot and secondary CPUs */ @@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs. */ - asm volatile ("mov %0, %%fs" : : "r" (0)); + /* Clear %gs. */ + asm volatile ("mov %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index c0c3b59de32..de27bd07bc9 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -6,6 +6,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/timer.h> +#include <asm/pci-direct.h> #include "cpu.h" @@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void) static void __cpuinit geode_configure(void) { unsigned long flags; - u8 ccr3, ccr4; + u8 ccr3; local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */ - - ccr4 = getCx86(CX86_CCR4); - ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86(CX86_CCR3, ccr3); + + /* FPU fast, DTE cache, Mem bypass */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); set_cx86_reorder(); @@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void) } -#ifdef CONFIG_PCI -static struct pci_device_id __cpuinitdata cyrix_55x0[] = { - { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, - { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, - { }, -}; -#endif - static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; @@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ #ifdef CONFIG_PCI + { + u32 vendor, device; /* It isn't really a PCI quirk directly, but the cure is the same. The MediaGX has deep magic SMM stuff that handles the SB emulation. It thows away the fifo on disable_dma() which @@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; + /* We do this before the PCI layer is running. However we + are safe here as we know the bridge must be a Cyrix + companion and must be present */ + vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID); + device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID); /* * The 5510/5520 companion chips have a funky PIT. */ - if (pci_dev_present(cyrix_55x0)) + if (vendor == PCI_VENDOR_ID_CYRIX && + (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) pit_latch_buggy = 1; + } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); - /* GXlv/GXm/GX1 */ - if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63) + /* + * GXm : 0x30 ... 0x5f GXm datasheet 51 + * GXlv: 0x6x GXlv datasheet 54 + * ? : 0x7x + * GX1 : 0x8x GX1 datasheet 56 + */ + if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f)) geode_configure(); get_model_name(c); /* get CPU marketing name */ return; @@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c) if (dir0 == 5 || dir0 == 3) { - unsigned char ccr3, ccr4; + unsigned char ccr3; unsigned long flags; printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ccr4 = getCx86(CX86_CCR4); - setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ local_irq_restore(flags); } } diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index d555bec0db9..4f10c62d180 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -12,6 +12,7 @@ #include <asm/processor.h> #include <asm/system.h> +#include <asm/mce.h> #include "mce.h" diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h index 84fd4cf7d0f..81fb6e2d35f 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.h +++ b/arch/i386/kernel/cpu/mcheck/mce.h @@ -1,4 +1,5 @@ #include <linux/init.h> +#include <asm/mce.h> void amd_mcheck_init(struct cpuinfo_x86 *c); void intel_p4_mcheck_init(struct cpuinfo_x86 *c); @@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); -extern int mce_disabled; extern int nr_mce_banks; diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 504434a4601..8359c19d3a2 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -12,6 +12,7 @@ #include <asm/system.h> #include <asm/msr.h> #include <asm/apic.h> +#include <asm/idle.h> #include <asm/therm_throt.h> @@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm fastcall void smp_thermal_interrupt(struct pt_regs *regs) { + exit_idle(); irq_enter(); vendor_thermal_interrupt(regs); irq_exit(); diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index ee771f305f9..c7d8f175674 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) default: return -ENOTTY; case MTRRIOC_ADD_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = @@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) file, 0); break; case MTRRIOC_SET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); break; case MTRRIOC_DEL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 0); break; case MTRRIOC_KILL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_del(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: +#endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); @@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) break; case MTRRIOC_ADD_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = @@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); break; case MTRRIOC_DEL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_file_del(sentry.base, sentry.size, file, 1); break; case MTRRIOC_KILL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_PAGE_ENTRY: +#endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = mtrr_del_page(-1, sentry.base, sentry.size); break; case MTRRIOC_GET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_PAGE_ENTRY: +#endif if (gentry.regnum >= num_var_ranges) return -EINVAL; mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 16bb7ea8714..0acfb6a5a22 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -50,7 +50,7 @@ u32 num_var_ranges = 0; unsigned int *usage_table; static DEFINE_MUTEX(mtrr_mutex); -u32 size_or_mask, size_and_mask; +u64 size_or_mask, size_and_mask; static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; @@ -662,8 +662,8 @@ void __init mtrr_bp_init(void) boot_cpu_data.x86_mask == 0x4)) phys_addr = 36; - size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; + size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { /* VIA C* family have Intel style MTRRs, but diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index d61ea9db6cf..289dfe6030e 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -84,7 +84,7 @@ void get_mtrr_state(void); extern void set_mtrr_ops(struct mtrr_ops * ops); -extern u32 size_or_mask, size_and_mask; +extern u64 size_or_mask, size_and_mask; extern struct mtrr_ops * mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 6624d8583c4..47e3ebbfb28 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm", + "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", NULL, - /* nothing */ /* constant_tsc - moved to flags */ + NULL, /* constant_tsc - moved to flags */ + /* nothing */ }; struct cpuinfo_x86 *c = v; int i, n = c - cpu_data; diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 4056fb7d2cd..5678d46863c 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; - unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; get_model_name(c); /* Same as AMD/Cyrix */ @@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) wrmsr(0x80860004, ~0, uk); c->x86_capability[0] = cpuid_edx(0x00000001); wrmsr(0x80860004, cap_mask, uk); + + /* All Transmeta CPUs have a constant TSC */ + set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); /* If we can run i686 user-space code, call us an i686 */ #define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index 4da75fa3208..eeae0d99233 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -48,7 +48,6 @@ static struct class *cpuid_class; #ifdef CONFIG_SMP struct cpuid_command { - int cpu; u32 reg; u32 *data; }; @@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block) { struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], + cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], &cmd->data[3]); } @@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data) if (cpu == smp_processor_id()) { cpuid(reg, &data[0], &data[1], &data[2], &data[3]); } else { - cmd.cpu = cpu; cmd.reg = reg; cmd.data = data; - smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); } preempt_enable(); } diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index f391abcf7da..70f39560846 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -14,6 +14,7 @@ #include <asm/pgtable.h> #include <asm/page.h> #include <asm/e820.h> +#include <asm/setup.h> #ifdef CONFIG_EFI int efi_enabled = 0; @@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static int romsignature(const unsigned char *x) +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) { unsigned short sig; - int ret = 0; - if (probe_kernel_address((const unsigned short *)x, sig) == 0) - ret = (sig == 0xaa55); - return ret; + + return probe_kernel_address((const unsigned short *)rom, sig) == 0 && + sig == ROMSIGNATURE; } static int __init romchecksum(unsigned char *rom, unsigned long length) { - unsigned char *p, sum = 0; + unsigned char sum; - for (p = rom; p < rom + length; p++) - sum += *p; + for (sum = 0; length; length--) + sum += *rom++; return sum == 0; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 5e47683fc63..18bddcb8e9e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -30,7 +30,7 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - %gs + * 24(%esp) - %fs * 28(%esp) - orig_eax * 2C(%esp) - %eip * 30(%esp) - %cs @@ -99,9 +99,9 @@ VM_MASK = 0x00020000 #define SAVE_ALL \ cld; \ - pushl %gs; \ + pushl %fs; \ CFI_ADJUST_CFA_OFFSET 4;\ - /*CFI_REL_OFFSET gs, 0;*/\ + /*CFI_REL_OFFSET fs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -133,7 +133,7 @@ VM_MASK = 0x00020000 movl %edx, %ds; \ movl %edx, %es; \ movl $(__KERNEL_PDA), %edx; \ - movl %edx, %gs + movl %edx, %fs #define RESTORE_INT_REGS \ popl %ebx; \ @@ -166,9 +166,9 @@ VM_MASK = 0x00020000 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -3: popl %gs; \ +3: popl %fs; \ CFI_ADJUST_CFA_OFFSET -4;\ - /*CFI_RESTORE gs;*/\ + /*CFI_RESTORE fs;*/\ .pushsection .fixup,"ax"; \ 4: movl $0,(%esp); \ jmp 1b; \ @@ -227,6 +227,7 @@ ENTRY(ret_from_fork) CFI_ADJUST_CFA_OFFSET -4 jmp syscall_exit CFI_ENDPROC +END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, @@ -258,6 +259,7 @@ ENTRY(resume_userspace) # int/exception return? jne work_pending jmp restore_all +END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) @@ -272,6 +274,7 @@ need_resched: jz restore_all call preempt_schedule_irq jmp need_resched +END(resume_kernel) #endif CFI_ENDPROC @@ -349,16 +352,17 @@ sysenter_past_esp: movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON -1: mov PT_GS(%esp), %gs +1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC .pushsection .fixup,"ax" -2: movl $0,PT_GS(%esp) +2: movl $0,PT_FS(%esp) jmp 1b .section __ex_table,"a" .align 4 .long 1b,2b .popsection +ENDPROC(sysenter_entry) # system call handler stub ENTRY(system_call) @@ -459,6 +463,7 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck CFI_ENDPROC +ENDPROC(system_call) # perform work that needs to be done immediately before resumption ALIGN @@ -504,6 +509,7 @@ work_notifysig_v86: xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig +END(work_pending) # perform syscall exit tracing ALIGN @@ -519,6 +525,7 @@ syscall_trace_entry: cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit +END(syscall_trace_entry) # perform syscall exit tracing ALIGN @@ -532,6 +539,7 @@ syscall_exit_work: movl $1, %edx call do_syscall_trace jmp resume_userspace +END(syscall_exit_work) CFI_ENDPROC RING0_INT_FRAME # can't unwind into user space anyway @@ -542,15 +550,17 @@ syscall_fault: GET_THREAD_INFO(%ebp) movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace +END(syscall_fault) syscall_badsys: movl $-ENOSYS,PT_EAX(%esp) jmp resume_userspace +END(syscall_badsys) CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %gs:PDA_cpu, %ebx; \ + movl %fs:PDA_cpu, %ebx; \ PER_CPU(cpu_gdt_descr, %ebx); \ movl GDS_address(%ebx), %ebx; \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ @@ -581,9 +591,9 @@ syscall_badsys: ENTRY(interrupt) .text -vector=0 ENTRY(irq_entries_start) RING0_INT_FRAME +vector=0 .rept NR_IRQS ALIGN .if vector @@ -592,11 +602,16 @@ ENTRY(irq_entries_start) 1: pushl $~(vector) CFI_ADJUST_CFA_OFFSET 4 jmp common_interrupt -.data + .previous .long 1b -.text + .text vector=vector+1 .endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous /* * the CPU automatically disables interrupts when executing an IRQ vector, @@ -609,6 +624,7 @@ common_interrupt: movl %esp,%eax call do_IRQ jmp ret_from_intr +ENDPROC(common_interrupt) CFI_ENDPROC #define BUILD_INTERRUPT(name, nr) \ @@ -621,18 +637,24 @@ ENTRY(name) \ movl %esp,%eax; \ call smp_/**/name; \ jmp ret_from_intr; \ - CFI_ENDPROC + CFI_ENDPROC; \ +ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" +/* This alternate entry is needed because we hijack the apic LVTT */ +#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) +BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) +#endif + KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: - /* the function address is in %gs's slot on the stack */ + /* the function address is in %fs's slot on the stack */ pushl %es CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET es, 0*/ @@ -661,20 +683,20 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %gs + pushl %fs CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET gs, 0*/ + /*CFI_REL_OFFSET fs, 0*/ movl $(__KERNEL_PDA), %ecx - movl %ecx, %gs + movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl PT_GS(%esp), %edi # get the function address + movl PT_FS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_GS(%esp) - /*CFI_REL_OFFSET gs, ES*/ + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es @@ -692,6 +714,7 @@ ENTRY(coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME @@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME @@ -721,6 +745,7 @@ device_not_available_emulate: CFI_ADJUST_CFA_OFFSET -4 jmp ret_from_exception CFI_ENDPROC +END(device_not_available) /* * Debug traps and NMI can happen at the one SYSENTER instruction @@ -864,10 +889,12 @@ ENTRY(native_iret) .align 4 .long 1b,iret_exc .previous +END(native_iret) ENTRY(native_irq_enable_sysexit) sti sysexit +END(native_irq_enable_sysexit) #endif KPROBE_ENTRY(int3) @@ -890,6 +917,7 @@ ENTRY(overflow) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(overflow) ENTRY(bounds) RING0_INT_FRAME @@ -899,6 +927,7 @@ ENTRY(bounds) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(bounds) ENTRY(invalid_op) RING0_INT_FRAME @@ -908,6 +937,7 @@ ENTRY(invalid_op) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME @@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME @@ -924,6 +955,7 @@ ENTRY(invalid_TSS) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME @@ -931,6 +963,7 @@ ENTRY(segment_not_present) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME @@ -938,6 +971,7 @@ ENTRY(stack_segment) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(stack_segment) KPROBE_ENTRY(general_protection) RING0_EC_FRAME @@ -953,6 +987,7 @@ ENTRY(alignment_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME @@ -962,6 +997,7 @@ ENTRY(divide_error) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) @@ -972,6 +1008,7 @@ ENTRY(machine_check) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(machine_check) #endif ENTRY(spurious_interrupt_bug) @@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC +END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index cb9abdfced9..3fa7f9389af 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -53,6 +53,7 @@ * any particular GDT layout, because we load our own as soon as we * can. */ +.section .text.head,"ax",@progbits ENTRY(startup_32) #ifdef CONFIG_PARAVIRT @@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20); jb 10b movl %edi,(init_pg_tables_end - __PAGE_OFFSET) -#ifdef CONFIG_SMP xorl %ebx,%ebx /* This is the boot CPU (BSP) */ jmp 3f - /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but * we know the trampoline has already loaded the boot_gdt_table GDT * for us. + * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later */ + +#ifdef CONFIG_HOTPLUG_CPU +.section .text,"ax",@progbits +#else +.section .init.text,"ax",@progbits +#endif + +#ifdef CONFIG_SMP ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -208,8 +218,8 @@ ENTRY(startup_32_smp) xorl %ebx,%ebx incl %ebx -3: #endif /* CONFIG_SMP */ +3: /* * Enable paging @@ -309,7 +319,7 @@ is386: movl $2,%ecx # set MP call check_x87 call setup_pda - lgdt cpu_gdt_descr + lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers @@ -319,12 +329,12 @@ is386: movl $2,%ecx # set MP movl %eax,%ds movl %eax,%es - xorl %eax,%eax # Clear FS and LDT - movl %eax,%fs + xorl %eax,%eax # Clear GS and LDT + movl %eax,%gs lldt %ax movl $(__KERNEL_PDA),%eax - mov %eax,%gs + mov %eax,%fs cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder @@ -360,12 +370,12 @@ check_x87: * cpu_gdt_table and boot_pda; for secondary CPUs, these will be * that CPU's GDT and PDA. */ -setup_pda: +ENTRY(setup_pda) /* get the PDA pointer */ movl start_pda, %eax /* slot the PDA address into the GDT */ - mov cpu_gdt_descr+2, %ecx + mov early_gdt_descr+2, %ecx mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ shr $16, %eax mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ @@ -492,6 +502,7 @@ ignore_int: #endif iret +.section .text #ifdef CONFIG_PARAVIRT startup_paravirt: cld @@ -502,10 +513,11 @@ startup_paravirt: pushl %ecx pushl %eax - /* paravirt.o is last in link, and that probe fn never returns */ pushl $__start_paravirtprobe 1: movl 0(%esp), %eax + cmpl $__stop_paravirtprobe, %eax + je unhandled_paravirt pushl (%eax) movl 8(%esp), %eax call *(%esp) @@ -517,6 +529,10 @@ startup_paravirt: addl $4, (%esp) jmp 1b + +unhandled_paravirt: + /* Nothing wanted us: we're screwed. */ + ud2 #endif /* @@ -581,7 +597,7 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -ENTRY(cpu_gdt_descr) +ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 .long cpu_gdt_table diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index ba8d302a0b7..e30ccedad0b 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1920,7 +1920,7 @@ static void __init setup_ioapic_ids_from_mpc(void) static void __init setup_ioapic_ids_from_mpc(void) { } #endif -static int no_timer_check __initdata; +int no_timer_check __initdata; static int __init notimercheck(char *s) { @@ -2310,7 +2310,7 @@ static inline void __init check_timer(void) disable_8259A_irq(0); set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, - "fasteio"); + "fasteoi"); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 3201d421090..5785d84103a 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -19,6 +19,8 @@ #include <linux/cpu.h> #include <linux/delay.h> +#include <asm/idle.h> + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -61,6 +63,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) union irq_ctx *curctx, *irqctx; u32 *isp; #endif + exit_idle(); if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index af1d5334499..b545bc746fc 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -363,7 +363,7 @@ no_kprobe: " pushf\n" /* skip cs, eip, orig_eax */ " subl $12, %esp\n" - " pushl %gs\n" + " pushl %fs\n" " pushl %ds\n" " pushl %es\n" " pushl %eax\n" @@ -387,7 +387,7 @@ no_kprobe: " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* skip eip, orig_eax, es, ds, gs */ + /* skip eip, orig_eax, es, ds, fs */ " addl $20, %esp\n" " popf\n" " ret\n"); @@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); /* fixup registers */ - regs->xcs = __KERNEL_CS; + regs->xcs = __KERNEL_CS | get_kernel_rpl(); regs->eip = trampoline_address; regs->orig_eax = 0xffffffff; diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index 381252bae3d..b8f16633a6e 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -384,7 +384,7 @@ static int do_microcode_update (void) { long cursor = 0; int error = 0; - void *new_mc; + void *new_mc = NULL; int cpu; cpumask_t old; diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index 4e14264f392..bcaa6e9b619 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) #ifdef CONFIG_SMP struct msr_command { - int cpu; int err; u32 reg; u32 data[2]; @@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block) { struct msr_command *cmd = (struct msr_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); + cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); } static void msr_smp_rdmsr(void *cmd_block) { struct msr_command *cmd = (struct msr_command *)cmd_block; - if (cmd->cpu == smp_processor_id()) - cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); + cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); } static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) @@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) if (cpu == smp_processor_id()) { ret = wrmsr_eio(reg, eax, edx); } else { - cmd.cpu = cpu; cmd.reg = reg; cmd.data[0] = eax; cmd.data[1] = edx; - smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); + smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1); ret = cmd.err; } preempt_enable(); @@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) if (cpu == smp_processor_id()) { ret = rdmsr_eio(reg, eax, edx); } else { - cmd.cpu = cpu; cmd.reg = reg; - smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); + smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1); *eax = cmd.data[0]; *edx = cmd.data[1]; diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 1a6f8bb8881..5d8a07c2028 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -185,7 +185,8 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6) + || (boot_cpu_data.x86 == 16)); case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; @@ -216,6 +217,28 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + u64 counter_val; + unsigned int retval = hz; + + /* + * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + counter_val = (u64)cpu_khz * 1000; + do_div(counter_val, retval); + if (counter_val > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + retval = count + 1; + } + return retval; +} + static int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -281,18 +304,10 @@ static int __init check_nmi_watchdog(void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - nmi_hz = count + 1; + + if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } } @@ -369,6 +384,34 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -442,6 +485,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) wrmsrl(perfctr_msr, 0 - count); } +static void write_watchdog_counter32(unsigned int perfctr_msr, + const char *descr) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsr(perfctr_msr, (u32)(-count), 0); +} + /* Note that these events don't tick when the CPU idles. This means the frequency varies with CPU load. */ @@ -531,7 +585,8 @@ static int setup_p6_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); @@ -704,7 +759,8 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsr(evntsel_msr, evntsel, 0); @@ -762,7 +818,8 @@ void setup_apic_nmi_watchdog (void *unused) if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && + boot_cpu_data.x86 != 16) return; if (!setup_k7_watchdog()) return; @@ -956,6 +1013,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { @@ -964,9 +1023,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * other P6 variant. * ArchPerfom/Core Duo also needs this */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* P6/ARCH_PERFMON has 32 bit counter write */ + write_watchdog_counter32(wd->perfctr_msr, NULL); + } else { + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); } - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index e55fd05da0f..c156ecfa387 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) return insn_len; } -static fastcall unsigned long native_get_debugreg(int regno) +static unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ @@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno) return val; } -static fastcall void native_set_debugreg(int regno, unsigned long value) +static void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: @@ -146,55 +146,55 @@ void init_IRQ(void) paravirt_ops.init_IRQ(); } -static fastcall void native_clts(void) +static void native_clts(void) { asm volatile ("clts"); } -static fastcall unsigned long native_read_cr0(void) +static unsigned long native_read_cr0(void) { unsigned long val; asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr0(unsigned long val) +static void native_write_cr0(unsigned long val) { asm volatile("movl %0,%%cr0": :"r" (val)); } -static fastcall unsigned long native_read_cr2(void) +static unsigned long native_read_cr2(void) { unsigned long val; asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr2(unsigned long val) +static void native_write_cr2(unsigned long val) { asm volatile("movl %0,%%cr2": :"r" (val)); } -static fastcall unsigned long native_read_cr3(void) +static unsigned long native_read_cr3(void) { unsigned long val; asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); return val; } -static fastcall void native_write_cr3(unsigned long val) +static void native_write_cr3(unsigned long val) { asm volatile("movl %0,%%cr3": :"r" (val)); } -static fastcall unsigned long native_read_cr4(void) +static unsigned long native_read_cr4(void) { unsigned long val; asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); return val; } -static fastcall unsigned long native_read_cr4_safe(void) +static unsigned long native_read_cr4_safe(void) { unsigned long val; /* This could fault if %cr4 does not exist */ @@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void) return val; } -static fastcall void native_write_cr4(unsigned long val) +static void native_write_cr4(unsigned long val) { asm volatile("movl %0,%%cr4": :"r" (val)); } -static fastcall unsigned long native_save_fl(void) +static unsigned long native_save_fl(void) { unsigned long f; asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); return f; } -static fastcall void native_restore_fl(unsigned long f) +static void native_restore_fl(unsigned long f) { asm volatile("pushl %0 ; popfl": /* no output */ :"g" (f) :"memory", "cc"); } -static fastcall void native_irq_disable(void) +static void native_irq_disable(void) { asm volatile("cli": : :"memory"); } -static fastcall void native_irq_enable(void) +static void native_irq_enable(void) { asm volatile("sti": : :"memory"); } -static fastcall void native_safe_halt(void) +static void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static fastcall void native_halt(void) +static void native_halt(void) { asm volatile("hlt": : :"memory"); } -static fastcall void native_wbinvd(void) +static void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } -static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +static unsigned long long native_read_msr(unsigned int msr, int *err) { unsigned long long val; @@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) return val; } -static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +static int native_write_msr(unsigned int msr, unsigned long long val) { int err; asm volatile("2: wrmsr ; xorl %0,%0\n" @@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val) return err; } -static fastcall unsigned long long native_read_tsc(void) +static unsigned long long native_read_tsc(void) { unsigned long long val; asm volatile("rdtsc" : "=A" (val)); return val; } -static fastcall unsigned long long native_read_pmc(void) +static unsigned long long native_read_pmc(void) { unsigned long long val; asm volatile("rdpmc" : "=A" (val)); return val; } -static fastcall void native_load_tr_desc(void) +static void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } -static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +static void native_load_gdt(const struct Xgt_desc_struct *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); } -static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +static void native_load_idt(const struct Xgt_desc_struct *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } -static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +static void native_store_gdt(struct Xgt_desc_struct *dtr) { asm ("sgdt %0":"=m" (*dtr)); } -static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +static void native_store_idt(struct Xgt_desc_struct *dtr) { asm ("sidt %0":"=m" (*dtr)); } -static fastcall unsigned long native_store_tr(void) +static unsigned long native_store_tr(void) { unsigned long tr; asm ("str %0":"=r" (tr)); return tr; } -static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +static void native_load_tls(struct thread_struct *t, unsigned int cpu) { #define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] C(0); C(1); C(2); @@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 lp[1] = entry_high; } -static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static fastcall void native_load_esp0(struct tss_struct *tss, +static void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; @@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss, } } -static fastcall void native_io_delay(void) +static void native_io_delay(void) { asm volatile("outb %al,$0x80"); } -static fastcall void native_flush_tlb(void) +static void native_flush_tlb(void) { __native_flush_tlb(); } @@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void) * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. */ -static fastcall void native_flush_tlb_global(void) +static void native_flush_tlb_global(void) { __native_flush_tlb_global(); } -static fastcall void native_flush_tlb_single(u32 addr) +static void native_flush_tlb_single(u32 addr) { __native_flush_tlb_single(addr); } #ifndef CONFIG_X86_PAE -static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) +static void native_set_pte(pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; } #else /* CONFIG_X86_PAE */ -static fastcall void native_set_pte(pte_t *ptep, pte_t pte) +static void native_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { ptep->pte_low = 0; smp_wmb(); @@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long ptep->pte_low = pte.pte_low; } -static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) { set_64bit((unsigned long long *)ptep,pte_val(pteval)); } -static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); } -static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) +static void native_set_pud(pud_t *pudp, pud_t pudval) { *pudp = pudval; } -static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep->pte_low = 0; smp_wmb(); ptep->pte_high = 0; } -static fastcall void native_pmd_clear(pmd_t *pmd) +static void native_pmd_clear(pmd_t *pmd) { u32 *tmp = (u32 *)pmd; *tmp = 0; @@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd) #endif /* CONFIG_X86_PAE */ /* These are in entry.S */ -extern fastcall void native_iret(void); -extern fastcall void native_irq_enable_sysexit(void); +extern void native_iret(void); +extern void native_irq_enable_sysexit(void); static int __init print_banner(void) { @@ -482,9 +482,6 @@ static int __init print_banner(void) } core_initcall(print_banner); -/* We simply declare start_kernel to be the paravirt probe of last resort. */ -paravirt_probe(start_kernel); - struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = { .apic_write = native_apic_write, .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, + .setup_boot_clock = setup_boot_APIC_clock, + .setup_secondary_clock = setup_secondary_APIC_clock, #endif + .set_lazy_mode = (void *)native_nop, .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, + .alloc_pt = (void *)native_nop, + .alloc_pd = (void *)native_nop, + .alloc_pd_clone = (void *)native_nop, + .release_pt = (void *)native_nop, + .release_pd = (void *)native_nop, + .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, @@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = { .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, + + .startup_ipi_hook = (void *)native_nop, }; /* diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c new file mode 100644 index 00000000000..bc1f2d3ea27 --- /dev/null +++ b/arch/i386/kernel/pcspeaker.c @@ -0,0 +1,20 @@ +#include <linux/platform_device.h> +#include <linux/errno.h> +#include <linux/init.h> + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index c641056233a..7845d480c29 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -48,6 +48,7 @@ #include <asm/i387.h> #include <asm/desc.h> #include <asm/vm86.h> +#include <asm/idle.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> #endif @@ -80,6 +81,42 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} + +static DEFINE_PER_CPU(volatile unsigned long, idle_state); + +void enter_idle(void) +{ + /* needs to be atomic w.r.t. interrupts, not against other CPUs */ + __set_bit(0, &__get_cpu_var(idle_state)); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + /* needs to be atomic w.r.t. interrupts, not against other CPUs */ + if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +void exit_idle(void) +{ + if (current->pid) + return; + __exit_idle(); +} + void disable_hlt(void) { hlt_counter++; @@ -130,6 +167,7 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { + local_irq_enable(); cpu_relax(); } @@ -189,7 +227,16 @@ void cpu_idle(void) play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); + enter_idle(); idle(); + __exit_idle(); } preempt_enable_no_resched(); schedule(); @@ -243,7 +290,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __sti_mwait(eax, ecx); + else + local_irq_enable(); + } else { + local_irq_enable(); } } @@ -308,8 +359,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x GS: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); + printk(" DS: %04x ES: %04x FS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -340,7 +391,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xgs = __KERNEL_PDA; + regs.xfs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -425,7 +476,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, p->thread.eip = (unsigned long) ret_from_fork; - savesegment(fs,p->thread.fs); + savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -501,8 +552,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->regs.eax = regs->eax; dump->regs.ds = regs->xds; dump->regs.es = regs->xes; - savesegment(fs,dump->regs.fs); - dump->regs.gs = regs->xgs; + dump->regs.fs = regs->xfs; + savesegment(gs,dump->regs.gs); dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; @@ -653,7 +704,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_esp0(tss, next); /* - * Save away %fs. No need to save %gs, as it was saved on the + * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are * always kernel segments while inside the kernel. Doing this * before setting the new TLS descriptors avoids the situation @@ -662,7 +713,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - savesegment(fs, prev->fs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -670,14 +721,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_TLS(next, cpu); /* - * Restore %fs if needed. - * - * Glibc normally makes %fs be zero. + * Restore IOPL if needed. In normal use, the flags restore + * in the switch assembly will handle this. But if the kernel + * is running virtualized at a non-zero CPL, the popf will + * not restore flags, so it must be done in a separate step. */ - if (unlikely(prev->fs | next->fs)) - loadsegment(fs, next->fs); - - write_pda(pcurrent, next_p); + if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); /* * Now maybe handle debug registers and/or IO bitmaps @@ -688,6 +738,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas disable_tsc(prev_p, next_p); + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_leave_lazy_cpu_mode(); + /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now @@ -695,6 +754,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (next_p->fpu_counter > 5) math_state_restore(); + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + loadsegment(gs, next->gs); + + write_pda(pcurrent, next_p); + return prev_p; } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index af8aabe8580..4a8f8a25972 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -89,14 +89,14 @@ static int putreg(struct task_struct *child, unsigned long regno, unsigned long value) { switch (regno >> 2) { - case FS: + case GS: if (value && (value & 3) != 3) return -EIO; - child->thread.fs = value; + child->thread.gs = value; return 0; case DS: case ES: - case GS: + case FS: if (value && (value & 3) != 3) return -EIO; value &= 0xffff; @@ -112,7 +112,7 @@ static int putreg(struct task_struct *child, value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; break; } - if (regno > ES*4) + if (regno > FS*4) regno -= 1*4; put_stack_long(child, regno, value); return 0; @@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child, unsigned long retval = ~0UL; switch (regno >> 2) { - case FS: - retval = child->thread.fs; + case GS: + retval = child->thread.gs; break; case DS: case ES: - case GS: + case FS: case SS: case CS: retval = 0xffff; /* fall through */ default: - if (regno > ES*4) + if (regno > FS*4) regno -= 1*4; retval &= get_stack_long(child, regno); } diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 4694ac980cd..122623dcc6e 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -33,7 +33,6 @@ #include <linux/initrd.h> #include <linux/bootmem.h> #include <linux/seq_file.h> -#include <linux/platform_device.h> #include <linux/console.h> #include <linux/mca.h> #include <linux/root_dev.h> @@ -60,6 +59,7 @@ #include <asm/io_apic.h> #include <asm/ist.h> #include <asm/io.h> +#include <asm/vmi.h> #include <setup_arch.h> #include <bios_ebda.h> @@ -581,6 +581,14 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = setup_memory(); +#ifdef CONFIG_VMI + /* + * Must be after max_low_pfn is determined, and before kernel + * pagetables are setup. + */ + vmi_init(); +#endif + /* * NOTE: before this point _nobody_ is allowed to allocate * any memory using the bootmem allocator. Although the @@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p) #endif tsc_init(); } - -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); - -/* - * Local Variables: - * mode:c - * c-file-style:"k&r" - * c-basic-offset:8 - * End: - */ diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 65d7620eaa0..4f99e870c98 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <linux/ptrace.h> #include <linux/elf.h> +#include <linux/binfmts.h> #include <asm/processor.h> #include <asm/ucontext.h> #include <asm/uaccess.h> @@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - COPY_SEG(gs); - GET_SEG(fs); + GET_SEG(gs); + COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); COPY(edi); @@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); - savesegment(fs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + savesegment(gs, tmp); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); @@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + if (current->binfmt->hasvdso) + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + else + restorer = (void *)&frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 5285aff8367..9bd9637ae69 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -23,6 +23,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/idle.h> #include <mach_apic.h> /* @@ -374,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. + * AK: x86-64 has a faster method that could be ported. */ spin_lock(&tlbstate_lock); @@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -624,6 +624,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) /* * At this point the info structure may be out of scope unless wait==1 */ + exit_idle(); irq_enter(); (*func)(info); irq_exit(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8c6c8c52b95..f46a4d095e6 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -63,6 +63,7 @@ #include <mach_apic.h> #include <mach_wakecpu.h> #include <smpboot_hooks.h> +#include <asm/vmi.h> /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -545,12 +546,15 @@ static void __cpuinit start_secondary(void *unused) * booting is too fragile that we want to limit the * things done here to the most necessary things. */ +#ifdef CONFIG_VMI + vmi_bringup(); +#endif secondary_cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); - setup_secondary_APIC_clock(); + setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); enable_NMI_through_LVT0(NULL); @@ -619,7 +623,6 @@ extern struct { unsigned short ss; } stack_start; extern struct i386_pda *start_pda; -extern struct Xgt_desc_struct cpu_gdt_descr; #ifdef CONFIG_NUMA @@ -835,6 +838,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) num_starts = 0; /* + * Paravirt / VMI wants a startup IPI hook here to set up the + * target processor state. + */ + startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, + (unsigned long) stack_start.esp); + + /* * Run STARTUP IPI loop. */ Dprintk("#startup loops: %d.\n", num_starts); @@ -1320,7 +1330,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) smpboot_setup_io_apic(); - setup_boot_APIC_clock(); + setup_boot_clock(); /* * Synchronize the TSC with the AP diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index bc882a2b1db..13ca54a85a1 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -78,7 +78,7 @@ int __init sysenter_setup(void) syscall_pages[0] = virt_to_page(syscall_page); #ifdef CONFIG_COMPAT_VDSO - __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC); printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); #endif diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index c505b16c099..a4f67a6e682 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!user_mode_vm(regs) && in_lock_functions(pc)) { + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->ebp + 4); #else - unsigned long *sp; - if ((regs->xcs & 3) == 0) - sp = (unsigned long *)®s->esp; - else - sp = (unsigned long *)regs->esp; + unsigned long *sp = (unsigned long *)®s->esp; + /* Return address is either directly at stack pointer or above a saved eflags. Eflags has bits 22-31 zero, kernel addresses don't. */ @@ -232,6 +230,7 @@ EXPORT_SYMBOL(get_cmos_time); static void sync_cmos_clock(unsigned long dummy); static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +int no_sync_cmos_clock; static void sync_cmos_clock(unsigned long dummy) { @@ -275,7 +274,8 @@ static void sync_cmos_clock(unsigned long dummy) void notify_arch_cmos_timer(void) { - mod_timer(&sync_cmos_timer, jiffies + 1); + if (!no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); } static long clock_cmos_diff; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 0efad8aeb41..af0d3f70a81 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); int kstack_depth_to_print = 24; +static unsigned int code_bytes = 64; ATOMIC_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) @@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = 1; unsigned long esp; - unsigned short ss; + unsigned short ss, gs; esp = (unsigned long) (®s->esp); savesegment(ss, ss); + savesegment(gs, gs); if (user_mode_vm(regs)) { in_kernel = 0; esp = regs->esp; @@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs) regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); @@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs) */ if (in_kernel) { u8 *eip; - int code_bytes = 64; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); @@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - 43; + eip = (u8 *)regs->eip - code_prologue; if (eip < (u8 *)PAGE_OFFSET || probe_kernel_address(eip, c)) { /* try starting at EIP */ eip = (u8 *)regs->eip; - code_bytes = 32; + code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_bytes; i++, eip++) { + for (i = 0; i < code_len; i++, eip++) { if (eip < (u8 *)PAGE_OFFSET || probe_kernel_address(eip, c)) { printk(" Bad EIP value."); @@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s) return 1; } __setup("kstack=", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 2cfc7b09b92..46f752a8bbf 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -23,6 +23,7 @@ * an extra value to store the TSC freq */ unsigned int tsc_khz; +unsigned long long (*custom_sched_clock)(void); int tsc_disable; @@ -107,14 +108,14 @@ unsigned long long sched_clock(void) { unsigned long long this_offset; + if (unlikely(custom_sched_clock)) + return (*custom_sched_clock)(); + /* - * in the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. + * Fall back to jiffies if there's no TSC available: */ -#ifndef CONFIG_NUMA - if (!cpu_khz || check_tsc_unstable()) -#endif - /* no locking but a rare wrong value is not a big deal */ + if (unlikely(tsc_disable)) + /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ @@ -194,13 +195,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); void __init tsc_init(void) { if (!cpu_has_tsc || tsc_disable) - return; + goto out_no_tsc; cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; if (!cpu_khz) - return; + goto out_no_tsc; printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, @@ -208,6 +209,15 @@ void __init tsc_init(void) set_cyc2ns_scale(cpu_khz); use_tsc_delay(); + return; + +out_no_tsc: + /* + * Set the tsc_disable flag if there's no TSC support, this + * makes it a fast flag for the kernel to see whether it + * should be using the TSC. + */ + tsc_disable = 1; } #ifdef CONFIG_CPU_FREQ diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index be2f96e67f7..d1b8f2b7aea 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xfs, so copy everything up to - (but not including) xgs, and then rest after xgs. */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); - ret += copy_to_user(&user->__null_gs, ®s->pt.xgs, + /* kernel_vm86_regs is missing xgs, so copy everything up to + (but not including) orig_eax, and then rest including orig_eax. */ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.xgs)); + offsetof(struct kernel_vm86_regs, pt.orig_eax)); return ret; } @@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); - ret += copy_from_user(®s->pt.xgs, &user->__null_gs, + /* copy eax-xfs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); + /* copy orig_eax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.xgs) + + offsetof(struct kernel_vm86_regs, pt.orig_eax) + extra); - return ret; } @@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) ret = KVM86->regs32; - loadsegment(fs, current->thread.saved_fs); - ret->xgs = current->thread.saved_gs; + ret->xfs = current->thread.saved_fs; + loadsegment(gs, current->thread.saved_gs); return ret; } @@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs.pt.xds = 0; info->regs.pt.xes = 0; - info->regs.pt.xgs = 0; + info->regs.pt.xfs = 0; -/* we are clearing fs later just before "jmp resume_userspace", +/* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. */ @@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; - savesegment(fs, tsk->thread.saved_fs); - tsk->thread.saved_gs = info->regs32->xgs; + tsk->thread.saved_fs = info->regs32->xfs; + savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" - "mov %2, %%fs\n\t" + "mov %2, %%gs\n\t" "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c new file mode 100644 index 00000000000..bb5a7abf949 --- /dev/null +++ b/arch/i386/kernel/vmi.c @@ -0,0 +1,949 @@ +/* + * VMI specific paravirt-ops implementation + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to zach@vmware.com + * + */ + +#include <linux/module.h> +#include <linux/license.h> +#include <linux/cpu.h> +#include <linux/bootmem.h> +#include <linux/mm.h> +#include <asm/vmi.h> +#include <asm/io.h> +#include <asm/fixmap.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/timer.h> +#include <asm/vmi_time.h> + +/* Convenient for calling VMI functions indirectly in the ROM */ +typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); +typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int); + +#define call_vrom_func(rom,func) \ + (((VROMFUNC *)(rom->func))()) + +#define call_vrom_long_func(rom,func,arg) \ + (((VROMLONGFUNC *)(rom->func)) (arg)) + +static struct vrom_header *vmi_rom; +static int license_gplok; +static int disable_nodelay; +static int disable_pge; +static int disable_pse; +static int disable_sep; +static int disable_tsc; +static int disable_mtrr; + +/* Cached VMI operations */ +struct { + void (*cpuid)(void /* non-c */); + void (*_set_ldt)(u32 selector); + void (*set_tr)(u32 selector); + void (*set_kernel_stack)(u32 selector, u32 esp0); + void (*allocate_page)(u32, u32, u32, u32, u32); + void (*release_page)(u32, u32); + void (*set_pte)(pte_t, pte_t *, unsigned); + void (*update_pte)(pte_t *, unsigned); + void (*set_linear_mapping)(int, u32, u32, u32); + void (*flush_tlb)(int); + void (*set_initial_ap_state)(int, int); + void (*halt)(void); +} vmi_ops; + +/* XXX move this to alternative.h */ +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; + +/* + * VMI patching routines. + */ +#define MNEM_CALL 0xe8 +#define MNEM_JMP 0xe9 +#define MNEM_RET 0xc3 + +static char irq_save_disable_callout[] = { + MNEM_CALL, 0, 0, 0, 0, + MNEM_CALL, 0, 0, 0, 0, + MNEM_RET +}; +#define IRQ_PATCH_INT_MASK 0 +#define IRQ_PATCH_DISABLE 5 + +static inline void patch_offset(unsigned char *eip, unsigned char *dest) +{ + *(unsigned long *)(eip+1) = dest-eip-5; +} + +static unsigned patch_internal(int call, unsigned len, void *insns) +{ + u64 reloc; + struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; + reloc = call_vrom_long_func(vmi_rom, get_reloc, call); + switch(rel->type) { + case VMI_RELOCATION_CALL_REL: + BUG_ON(len < 5); + *(char *)insns = MNEM_CALL; + patch_offset(insns, rel->eip); + return 5; + + case VMI_RELOCATION_JUMP_REL: + BUG_ON(len < 5); + *(char *)insns = MNEM_JMP; + patch_offset(insns, rel->eip); + return 5; + + case VMI_RELOCATION_NOP: + /* obliterate the whole thing */ + return 0; + + case VMI_RELOCATION_NONE: + /* leave native code in place */ + break; + + default: + BUG(); + } + return len; +} + +/* + * Apply patch if appropriate, return length of new instruction + * sequence. The callee does nop padding for us. + */ +static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + switch (type) { + case PARAVIRT_IRQ_DISABLE: + return patch_internal(VMI_CALL_DisableInterrupts, len, insns); + case PARAVIRT_IRQ_ENABLE: + return patch_internal(VMI_CALL_EnableInterrupts, len, insns); + case PARAVIRT_RESTORE_FLAGS: + return patch_internal(VMI_CALL_SetInterruptMask, len, insns); + case PARAVIRT_SAVE_FLAGS: + return patch_internal(VMI_CALL_GetInterruptMask, len, insns); + case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE: + if (len >= 10) { + patch_internal(VMI_CALL_GetInterruptMask, len, insns); + patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5); + return 10; + } else { + /* + * You bastards didn't leave enough room to + * patch save_flags_irq_disable inline. Patch + * to a helper + */ + BUG_ON(len < 5); + *(char *)insns = MNEM_CALL; + patch_offset(insns, irq_save_disable_callout); + return 5; + } + case PARAVIRT_INTERRUPT_RETURN: + return patch_internal(VMI_CALL_IRET, len, insns); + case PARAVIRT_STI_SYSEXIT: + return patch_internal(VMI_CALL_SYSEXIT, len, insns); + default: + break; + } + return len; +} + +/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ +static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + int override = 0; + if (*eax == 1) + override = 1; + asm volatile ("call *%6" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + if (override) { + if (disable_pse) + *edx &= ~X86_FEATURE_PSE; + if (disable_pge) + *edx &= ~X86_FEATURE_PGE; + if (disable_sep) + *edx &= ~X86_FEATURE_SEP; + if (disable_tsc) + *edx &= ~X86_FEATURE_TSC; + if (disable_mtrr) + *edx &= ~X86_FEATURE_MTRR; + } +} + +static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) +{ + if (gdt[nr].a != new->a || gdt[nr].b != new->b) + write_gdt_entry(gdt, nr, new->a, new->b); +} + +static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]); + vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]); +} + +static void vmi_set_ldt(const void *addr, unsigned entries) +{ + unsigned cpu = smp_processor_id(); + u32 low, high; + + pack_descriptor(&low, &high, (unsigned long)addr, + entries * sizeof(struct desc_struct) - 1, + DESCTYPE_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); + vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); +} + +static void vmi_set_tr(void) +{ + vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); +} + +static void vmi_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0); +} + +static void vmi_flush_tlb_user(void) +{ + vmi_ops.flush_tlb(VMI_FLUSH_TLB); +} + +static void vmi_flush_tlb_kernel(void) +{ + vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); +} + +/* Stub to do nothing at all; used for delays and unimplemented calls */ +static void vmi_nop(void) +{ +} + +/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ +#ifdef CONFIG_NO_IDLE_HZ +static fastcall void vmi_safe_halt(void) +{ + int idle = vmi_stop_hz_timer(); + vmi_ops.halt(); + if (idle) { + local_irq_disable(); + vmi_account_time_restart_hz_timer(); + local_irq_enable(); + } +} +#endif + +#ifdef CONFIG_DEBUG_PAGE_TYPE + +#ifdef CONFIG_X86_PAE +#define MAX_BOOT_PTS (2048+4+1) +#else +#define MAX_BOOT_PTS (1024+1) +#endif + +/* + * During boot, mem_map is not yet available in paging_init, so stash + * all the boot page allocations here. + */ +static struct { + u32 pfn; + int type; +} boot_page_allocations[MAX_BOOT_PTS]; +static int num_boot_page_allocations; +static int boot_allocations_applied; + +void vmi_apply_boot_page_allocations(void) +{ + int i; + BUG_ON(!mem_map); + for (i = 0; i < num_boot_page_allocations; i++) { + struct page *page = pfn_to_page(boot_page_allocations[i].pfn); + page->type = boot_page_allocations[i].type; + page->type = boot_page_allocations[i].type & + ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + } + boot_allocations_applied = 1; +} + +static void record_page_type(u32 pfn, int type) +{ + BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS); + boot_page_allocations[num_boot_page_allocations].pfn = pfn; + boot_page_allocations[num_boot_page_allocations].type = type; + num_boot_page_allocations++; +} + +static void check_zeroed_page(u32 pfn, int type, struct page *page) +{ + u32 *ptr; + int i; + int limit = PAGE_SIZE / sizeof(int); + + if (page_address(page)) + ptr = (u32 *)page_address(page); + else + ptr = (u32 *)__va(pfn << PAGE_SHIFT); + /* + * When cloning the root in non-PAE mode, only the userspace + * pdes need to be zeroed. + */ + if (type & VMI_PAGE_CLONE) + limit = USER_PTRS_PER_PGD; + for (i = 0; i < limit; i++) + BUG_ON(ptr[i]); +} + +/* + * We stash the page type into struct page so we can verify the page + * types are used properly. + */ +static void vmi_set_page_type(u32 pfn, int type) +{ + /* PAE can have multiple roots per page - don't track */ + if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) + return; + + if (boot_allocations_applied) { + struct page *page = pfn_to_page(pfn); + if (type != VMI_PAGE_NORMAL) + BUG_ON(page->type); + else + BUG_ON(page->type == VMI_PAGE_NORMAL); + page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + if (type & VMI_PAGE_ZEROED) + check_zeroed_page(pfn, type, page); + } else { + record_page_type(pfn, type); + } +} + +static void vmi_check_page_type(u32 pfn, int type) +{ + /* PAE can have multiple roots per page - skip checks */ + if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP)) + return; + + type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE); + if (boot_allocations_applied) { + struct page *page = pfn_to_page(pfn); + BUG_ON((page->type ^ type) & VMI_PAGE_PAE); + BUG_ON(type == VMI_PAGE_NORMAL && page->type); + BUG_ON((type & page->type) == 0); + } +} +#else +#define vmi_set_page_type(p,t) do { } while (0) +#define vmi_check_page_type(p,t) do { } while (0) +#endif + +static void vmi_allocate_pt(u32 pfn) +{ + vmi_set_page_type(pfn, VMI_PAGE_L1); + vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); +} + +static void vmi_allocate_pd(u32 pfn) +{ + /* + * This call comes in very early, before mem_map is setup. + * It is called only for swapper_pg_dir, which already has + * data on it. + */ + vmi_set_page_type(pfn, VMI_PAGE_L2); + vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); +} + +static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +{ + vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); + vmi_check_page_type(clonepfn, VMI_PAGE_L2); + vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); +} + +static void vmi_release_pt(u32 pfn) +{ + vmi_ops.release_page(pfn, VMI_PAGE_L1); + vmi_set_page_type(pfn, VMI_PAGE_NORMAL); +} + +static void vmi_release_pd(u32 pfn) +{ + vmi_ops.release_page(pfn, VMI_PAGE_L2); + vmi_set_page_type(pfn, VMI_PAGE_NORMAL); +} + +/* + * Helper macros for MMU update flags. We can defer updates until a flush + * or page invalidation only if the update is to the current address space + * (otherwise, there is no flush). We must check against init_mm, since + * this could be a kernel update, which usually passes init_mm, although + * sometimes this check can be skipped if we know the particular function + * is only called on user mode PTEs. We could change the kernel to pass + * current->active_mm here, but in particular, I was unsure if changing + * mm/highmem.c to do this would still be correct on other architectures. + */ +#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \ + (!mustbeuser && (mm) == &init_mm)) +#define vmi_flags_addr(mm, addr, level, user) \ + ((level) | (is_current_as(mm, user) ? \ + (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) +#define vmi_flags_addr_defer(mm, addr, level, user) \ + ((level) | (is_current_as(mm, user) ? \ + (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0)) + +static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_set_pte(pte_t *ptep, pte_t pte) +{ + /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD); + vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); +} + +static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ +#ifdef CONFIG_X86_PAE + const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; + vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); +#else + const pte_t pte = { pmdval.pud.pgd.pgd }; + vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD); +#endif + vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); +} + +#ifdef CONFIG_X86_PAE + +static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + /* + * XXX This is called from set_pmd_pte, but at both PT + * and PD layers so the VMI_PAGE_PT flag is wrong. But + * it is only called for large page mapping changes, + * the Xen backend, doesn't support large pages, and the + * ESX backend doesn't depend on the flag. + */ + set_64bit((unsigned long long *)ptep,pte_val(pteval)); + vmi_ops.update_pte(ptep, VMI_PAGE_PT); +} + +static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); +} + +static void vmi_set_pud(pud_t *pudp, pud_t pudval) +{ + /* Um, eww */ + const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; + vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); + vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); +} + +static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + const pte_t pte = { 0 }; + vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); + vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); +} + +void vmi_pmd_clear(pmd_t *pmd) +{ + const pte_t pte = { 0 }; + vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); + vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); +} +#endif + +#ifdef CONFIG_SMP +struct vmi_ap_state ap; +extern void setup_pda(void); + +static void __init /* XXX cpu hotplug */ +vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, + unsigned long start_esp) +{ + /* Default everything to zero. This is fine for most GPRs. */ + memset(&ap, 0, sizeof(struct vmi_ap_state)); + + ap.gdtr_limit = GDT_SIZE - 1; + ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid); + + ap.idtr_limit = IDT_ENTRIES * 8 - 1; + ap.idtr_base = (unsigned long) idt_table; + + ap.ldtr = 0; + + ap.cs = __KERNEL_CS; + ap.eip = (unsigned long) start_eip; + ap.ss = __KERNEL_DS; + ap.esp = (unsigned long) start_esp; + + ap.ds = __USER_DS; + ap.es = __USER_DS; + ap.fs = __KERNEL_PDA; + ap.gs = 0; + + ap.eflags = 0; + + setup_pda(); + +#ifdef CONFIG_X86_PAE + /* efer should match BSP efer. */ + if (cpu_has_nx) { + unsigned l, h; + rdmsr(MSR_EFER, l, h); + ap.efer = (unsigned long long) h << 32 | l; + } +#endif + + ap.cr3 = __pa(swapper_pg_dir); + /* Protected mode, paging, AM, WP, NE, MP. */ + ap.cr0 = 0x80050023; + ap.cr4 = mmu_cr4_features; + vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid); +} +#endif + +static inline int __init check_vmi_rom(struct vrom_header *rom) +{ + struct pci_header *pci; + struct pnp_header *pnp; + const char *manufacturer = "UNKNOWN"; + const char *product = "UNKNOWN"; + const char *license = "unspecified"; + + if (rom->rom_signature != 0xaa55) + return 0; + if (rom->vrom_signature != VMI_SIGNATURE) + return 0; + if (rom->api_version_maj != VMI_API_REV_MAJOR || + rom->api_version_min+1 < VMI_API_REV_MINOR+1) { + printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n", + rom->api_version_maj, + rom->api_version_min); + return 0; + } + + /* + * Relying on the VMI_SIGNATURE field is not 100% safe, so check + * the PCI header and device type to make sure this is really a + * VMI device. + */ + if (!rom->pci_header_offs) { + printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n"); + return 0; + } + + pci = (struct pci_header *)((char *)rom+rom->pci_header_offs); + if (pci->vendorID != PCI_VENDOR_ID_VMWARE || + pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) { + /* Allow it to run... anyways, but warn */ + printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n"); + } + + if (rom->pnp_header_offs) { + pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs); + if (pnp->manufacturer_offset) + manufacturer = (const char *)rom+pnp->manufacturer_offset; + if (pnp->product_offset) + product = (const char *)rom+pnp->product_offset; + } + + if (rom->license_offs) + license = (char *)rom+rom->license_offs; + + printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n", + manufacturer, product, + rom->api_version_maj, rom->api_version_min, + pci->rom_version_maj, pci->rom_version_min); + + license_gplok = license_is_gpl_compatible(license); + if (!license_gplok) { + printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... " + "inlining disabled\n", + license); + add_taint(TAINT_PROPRIETARY_MODULE); + } + return 1; +} + +/* + * Probe for the VMI option ROM + */ +static inline int __init probe_vmi_rom(void) +{ + unsigned long base; + + /* VMI ROM is in option ROM area, check signature */ + for (base = 0xC0000; base < 0xE0000; base += 2048) { + struct vrom_header *romstart; + romstart = (struct vrom_header *)isa_bus_to_virt(base); + if (check_vmi_rom(romstart)) { + vmi_rom = romstart; + return 1; + } + } + return 0; +} + +/* + * VMI setup common to all processors + */ +void vmi_bringup(void) +{ + /* We must establish the lowmem mapping for MMU ops to work */ + if (vmi_rom) + vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); +} + +/* + * Return a pointer to the VMI function or a NOP stub + */ +static void *vmi_get_function(int vmicall) +{ + u64 reloc; + const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall); + BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); + if (rel->type == VMI_RELOCATION_CALL_REL) + return (void *)rel->eip; + else + return (void *)vmi_nop; +} + +/* + * Helper macro for making the VMI paravirt-ops fill code readable. + * For unimplemented operations, fall back to default. + */ +#define para_fill(opname, vmicall) \ +do { \ + reloc = call_vrom_long_func(vmi_rom, get_reloc, \ + VMI_CALL_##vmicall); \ + if (rel->type != VMI_RELOCATION_NONE) { \ + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \ + paravirt_ops.opname = (void *)rel->eip; \ + } \ +} while (0) + +/* + * Activate the VMI interface and switch into paravirtualized mode + */ +static inline int __init activate_vmi(void) +{ + short kernel_cs; + u64 reloc; + const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + + if (call_vrom_func(vmi_rom, vmi_init) != 0) { + printk(KERN_ERR "VMI ROM failed to initialize!"); + return 0; + } + savesegment(cs, kernel_cs); + + paravirt_ops.paravirt_enabled = 1; + paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + + paravirt_ops.patch = vmi_patch; + paravirt_ops.name = "vmi"; + + /* + * Many of these operations are ABI compatible with VMI. + * This means we can fill in the paravirt-ops with direct + * pointers into the VMI ROM. If the calling convention for + * these operations changes, this code needs to be updated. + * + * Exceptions + * CPUID paravirt-op uses pointers, not the native ISA + * halt has no VMI equivalent; all VMI halts are "safe" + * no MSR support yet - just trap and emulate. VMI uses the + * same ABI as the native ISA, but Linux wants exceptions + * from bogus MSR read / write handled + * rdpmc is not yet used in Linux + */ + + /* CPUID is special, so very special */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.cpuid = (void *)rel->eip; + paravirt_ops.cpuid = vmi_cpuid; + } + + para_fill(clts, CLTS); + para_fill(get_debugreg, GetDR); + para_fill(set_debugreg, SetDR); + para_fill(read_cr0, GetCR0); + para_fill(read_cr2, GetCR2); + para_fill(read_cr3, GetCR3); + para_fill(read_cr4, GetCR4); + para_fill(write_cr0, SetCR0); + para_fill(write_cr2, SetCR2); + para_fill(write_cr3, SetCR3); + para_fill(write_cr4, SetCR4); + para_fill(save_fl, GetInterruptMask); + para_fill(restore_fl, SetInterruptMask); + para_fill(irq_disable, DisableInterrupts); + para_fill(irq_enable, EnableInterrupts); + /* irq_save_disable !!! sheer pain */ + patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK], + (char *)paravirt_ops.save_fl); + patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE], + (char *)paravirt_ops.irq_disable); +#ifndef CONFIG_NO_IDLE_HZ + para_fill(safe_halt, Halt); +#else + vmi_ops.halt = vmi_get_function(VMI_CALL_Halt); + paravirt_ops.safe_halt = vmi_safe_halt; +#endif + para_fill(wbinvd, WBINVD); + /* paravirt_ops.read_msr = vmi_rdmsr */ + /* paravirt_ops.write_msr = vmi_wrmsr */ + para_fill(read_tsc, RDTSC); + /* paravirt_ops.rdpmc = vmi_rdpmc */ + + /* TR interface doesn't pass TR value */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.set_tr = (void *)rel->eip; + paravirt_ops.load_tr_desc = vmi_set_tr; + } + + /* LDT is special, too */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops._set_ldt = (void *)rel->eip; + paravirt_ops.set_ldt = vmi_set_ldt; + } + + para_fill(load_gdt, SetGDT); + para_fill(load_idt, SetIDT); + para_fill(store_gdt, GetGDT); + para_fill(store_idt, GetIDT); + para_fill(store_tr, GetTR); + paravirt_ops.load_tls = vmi_load_tls; + para_fill(write_ldt_entry, WriteLDTEntry); + para_fill(write_gdt_entry, WriteGDTEntry); + para_fill(write_idt_entry, WriteIDTEntry); + reloc = call_vrom_long_func(vmi_rom, get_reloc, + VMI_CALL_UpdateKernelStack); + if (rel->type != VMI_RELOCATION_NONE) { + BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); + vmi_ops.set_kernel_stack = (void *)rel->eip; + paravirt_ops.load_esp0 = vmi_load_esp0; + } + + para_fill(set_iopl_mask, SetIOPLMask); + paravirt_ops.io_delay = (void *)vmi_nop; + if (!disable_nodelay) { + paravirt_ops.const_udelay = (void *)vmi_nop; + } + + para_fill(set_lazy_mode, SetLazyMode); + + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB); + if (rel->type != VMI_RELOCATION_NONE) { + vmi_ops.flush_tlb = (void *)rel->eip; + paravirt_ops.flush_tlb_user = vmi_flush_tlb_user; + paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel; + } + para_fill(flush_tlb_single, InvalPage); + + /* + * Until a standard flag format can be agreed on, we need to + * implement these as wrappers in Linux. Get the VMI ROM + * function pointers for the two backend calls. + */ +#ifdef CONFIG_X86_PAE + vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong); + vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong); +#else + vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE); + vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE); +#endif + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); + vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); + vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); + + paravirt_ops.alloc_pt = vmi_allocate_pt; + paravirt_ops.alloc_pd = vmi_allocate_pd; + paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; + paravirt_ops.release_pt = vmi_release_pt; + paravirt_ops.release_pd = vmi_release_pd; + paravirt_ops.set_pte = vmi_set_pte; + paravirt_ops.set_pte_at = vmi_set_pte_at; + paravirt_ops.set_pmd = vmi_set_pmd; + paravirt_ops.pte_update = vmi_update_pte; + paravirt_ops.pte_update_defer = vmi_update_pte_defer; +#ifdef CONFIG_X86_PAE + paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; + paravirt_ops.set_pte_present = vmi_set_pte_present; + paravirt_ops.set_pud = vmi_set_pud; + paravirt_ops.pte_clear = vmi_pte_clear; + paravirt_ops.pmd_clear = vmi_pmd_clear; +#endif + /* + * These MUST always be patched. Don't support indirect jumps + * through these operations, as the VMI interface may use either + * a jump or a call to get to these operations, depending on + * the backend. They are performance critical anyway, so requiring + * a patch is not a big problem. + */ + paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; + paravirt_ops.iret = (void *)0xbadbab0; + +#ifdef CONFIG_SMP + paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook; + vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead); + paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite); + paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite); +#endif + + /* + * Check for VMI timer functionality by probing for a cycle frequency method + */ + reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency); + if (rel->type != VMI_RELOCATION_NONE) { + vmi_timer_ops.get_cycle_frequency = (void *)rel->eip; + vmi_timer_ops.get_cycle_counter = + vmi_get_function(VMI_CALL_GetCycleCounter); + vmi_timer_ops.get_wallclock = + vmi_get_function(VMI_CALL_GetWallclockTime); + vmi_timer_ops.wallclock_updated = + vmi_get_function(VMI_CALL_WallclockUpdated); + vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); + vmi_timer_ops.cancel_alarm = + vmi_get_function(VMI_CALL_CancelAlarm); + paravirt_ops.time_init = vmi_time_init; + paravirt_ops.get_wallclock = vmi_get_wallclock; + paravirt_ops.set_wallclock = vmi_set_wallclock; +#ifdef CONFIG_X86_LOCAL_APIC + paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; + paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; +#endif + custom_sched_clock = vmi_sched_clock; + } + + /* + * Alternative instruction rewriting doesn't happen soon enough + * to convert VMI_IRET to a call instead of a jump; so we have + * to do this before IRQs get reenabled. Fortunately, it is + * idempotent. + */ + apply_paravirt(__start_parainstructions, __stop_parainstructions); + + vmi_bringup(); + + return 1; +} + +#undef para_fill + +void __init vmi_init(void) +{ + unsigned long flags; + + if (!vmi_rom) + probe_vmi_rom(); + else + check_vmi_rom(vmi_rom); + + /* In case probing for or validating the ROM failed, basil */ + if (!vmi_rom) + return; + + reserve_top_address(-vmi_rom->virtual_top); + + local_irq_save(flags); + activate_vmi(); +#ifdef CONFIG_SMP + no_timer_check = 1; +#endif + local_irq_restore(flags & X86_EFLAGS_IF); +} + +static int __init parse_vmi(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "disable_nodelay")) + disable_nodelay = 1; + else if (!strcmp(arg, "disable_pge")) { + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + disable_pge = 1; + } else if (!strcmp(arg, "disable_pse")) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else if (!strcmp(arg, "disable_sep")) { + clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); + disable_sep = 1; + } else if (!strcmp(arg, "disable_tsc")) { + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + disable_tsc = 1; + } else if (!strcmp(arg, "disable_mtrr")) { + clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); + disable_mtrr = 1; + } + return 0; +} + +early_param("vmi", parse_vmi); diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c new file mode 100644 index 00000000000..2e2d8dbcbd6 --- /dev/null +++ b/arch/i386/kernel/vmitime.c @@ -0,0 +1,499 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2005, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to dhecht@vmware.com + * + */ + +/* + * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. + * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/jiffies.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/rcupdate.h> +#include <linux/clocksource.h> + +#include <asm/timer.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/div64.h> +#include <asm/timer.h> +#include <asm/desc.h> + +#include <asm/vmi.h> +#include <asm/vmi_time.h> + +#include <mach_timer.h> +#include <io_ports.h> + +#ifdef CONFIG_X86_LOCAL_APIC +#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT +#else +#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 +#endif + +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; + +#ifdef CONFIG_NO_IDLE_HZ + +/* /proc/sys/kernel/hz_timer state. */ +int sysctl_hz_timer; + +/* Some stats */ +static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); +static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); +static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); + +#endif /* CONFIG_NO_IDLE_HZ */ + +/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ +static int alarm_hz = CONFIG_VMI_ALARM_HZ; + +/* Cache of the value get_cycle_frequency / HZ. */ +static signed long long cycles_per_jiffy; + +/* Cache of the value get_cycle_frequency / alarm_hz. */ +static signed long long cycles_per_alarm; + +/* The number of cycles accounted for by the 'jiffies'/'xtime' count. + * Protected by xtime_lock. */ +static unsigned long long real_cycles_accounted_system; + +/* The number of cycles accounted for by update_process_times(), per cpu. */ +static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); + +/* The number of stolen cycles accounted, per cpu. */ +static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); + +/* Clock source. */ +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static cycle_t read_available_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +#if 0 +static cycle_t read_stolen_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); +} +#endif /* 0 */ + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .is_continuous = 1, +}; + + +/* Timer interrupt handler. */ +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); + +static struct irqaction vmi_timer_irq = { + vmi_timer_interrupt, + SA_INTERRUPT, + CPU_MASK_NONE, + "VMI-alarm", + NULL, + NULL +}; + +/* Alarm rate */ +static int __init vmi_timer_alarm_rate_setup(char* str) +{ + int alarm_rate; + if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { + alarm_hz = alarm_rate; + printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); + } + return 1; +} +__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); + + +/* Initialization */ +static void vmi_get_wallclock_ts(struct timespec *ts) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec units + ts->tv_nsec = do_div(wallclock, 1000000000); + ts->tv_sec = wallclock; +} + +static void update_xtime_from_wallclock(void) +{ + struct timespec ts; + vmi_get_wallclock_ts(&ts); + do_settimeofday(&ts); +} + +unsigned long vmi_get_wallclock(void) +{ + struct timespec ts; + vmi_get_wallclock_ts(&ts); + return ts.tv_sec; +} + +int vmi_set_wallclock(unsigned long now) +{ + return -1; +} + +unsigned long long vmi_sched_clock(void) +{ + return read_available_cycles(); +} + +void __init vmi_time_init(void) +{ + unsigned long long cycles_per_sec, cycles_per_msec; + unsigned long flags; + + local_irq_save(flags); + setup_irq(0, &vmi_timer_irq); +#ifdef CONFIG_X86_LOCAL_APIC + set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); +#endif + + no_sync_cmos_clock = 1; + + vmi_get_wallclock_ts(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + real_cycles_accounted_system = read_real_cycles(); + update_xtime_from_wallclock(); + per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); + + cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); + + cycles_per_jiffy = cycles_per_sec; + (void)do_div(cycles_per_jiffy, HZ); + cycles_per_alarm = cycles_per_sec; + (void)do_div(cycles_per_alarm, alarm_hz); + cycles_per_msec = cycles_per_sec; + (void)do_div(cycles_per_msec, 1000); + cpu_khz = cycles_per_msec; + + printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" + "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, + cycles_per_alarm); + + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + if (clocksource_register(&clocksource_vmi)) + printk(KERN_WARNING "Error registering VMITIME clocksource."); + + /* Disable PIT. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu + * reduce the latency calling update_process_times. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, + cycles_per_alarm); + + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_LOCAL_APIC + +void __init vmi_timer_setup_boot_alarm(void) +{ + local_irq_disable(); + + /* Route the interrupt to the correct vector. */ + apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); + + /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ + vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, + cycles_per_alarm); + local_irq_enable(); +} + +/* Initialize the time accounting variables for an AP on an SMP system. + * Also, set the local alarm for the AP. */ +void __init vmi_timer_setup_secondary_alarm(void) +{ + int cpu = smp_processor_id(); + + /* Route the interrupt to the correct vector. */ + apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); + + per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); + + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, + cycles_per_alarm); +} + +#endif + +/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ +static void vmi_account_real_cycles(unsigned long long cur_real_cycles) +{ + long long cycles_not_accounted; + + write_seqlock(&xtime_lock); + + cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; + while (cycles_not_accounted >= cycles_per_jiffy) { + /* systems wide jiffies and wallclock. */ + do_timer(1); + + cycles_not_accounted -= cycles_per_jiffy; + real_cycles_accounted_system += cycles_per_jiffy; + } + + if (vmi_timer_ops.wallclock_updated()) + update_xtime_from_wallclock(); + + write_sequnlock(&xtime_lock); +} + +/* Update per-cpu process times. */ +static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, + unsigned long long cur_process_times_cycles) +{ + long long cycles_not_accounted; + cycles_not_accounted = cur_process_times_cycles - + per_cpu(process_times_cycles_accounted_cpu, cpu); + + while (cycles_not_accounted >= cycles_per_jiffy) { + /* Account time to the current process. This includes + * calling into the scheduler to decrement the timeslice + * and possibly reschedule.*/ + update_process_times(user_mode(regs)); + /* XXX handle /proc/profile multiplier. */ + profile_tick(CPU_PROFILING); + + cycles_not_accounted -= cycles_per_jiffy; + per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } +} + +#ifdef CONFIG_NO_IDLE_HZ +/* Update per-cpu idle times. Used when a no-hz halt is ended. */ +static void vmi_account_no_hz_idle_cycles(int cpu, + unsigned long long cur_process_times_cycles) +{ + long long cycles_not_accounted; + unsigned long no_idle_hz_jiffies = 0; + + cycles_not_accounted = cur_process_times_cycles - + per_cpu(process_times_cycles_accounted_cpu, cpu); + + while (cycles_not_accounted >= cycles_per_jiffy) { + no_idle_hz_jiffies++; + cycles_not_accounted -= cycles_per_jiffy; + per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } + /* Account time to the idle process. */ + account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); +} +#endif + +/* Update per-cpu stolen time. */ +static void vmi_account_stolen_cycles(int cpu, + unsigned long long cur_real_cycles, + unsigned long long cur_avail_cycles) +{ + long long stolen_cycles_not_accounted; + unsigned long stolen_jiffies = 0; + + if (cur_real_cycles < cur_avail_cycles) + return; + + stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - + per_cpu(stolen_cycles_accounted_cpu, cpu); + + while (stolen_cycles_not_accounted >= cycles_per_jiffy) { + stolen_jiffies++; + stolen_cycles_not_accounted -= cycles_per_jiffy; + per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; + } + /* HACK: pass NULL to force time onto cpustat->steal. */ + account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); +} + +/* Body of either IRQ0 interrupt handler (UP no local-APIC) or + * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */ +static void vmi_local_timer_interrupt(int cpu) +{ + unsigned long long cur_real_cycles, cur_process_times_cycles; + + cur_real_cycles = read_real_cycles(); + cur_process_times_cycles = read_available_cycles(); + /* Update system wide (real) time state (xtime, jiffies). */ + vmi_account_real_cycles(cur_real_cycles); + /* Update per-cpu process times. */ + vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); + /* Update time stolen from this cpu by the hypervisor. */ + vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); +} + +#ifdef CONFIG_NO_IDLE_HZ + +/* Must be called only from idle loop, with interrupts disabled. */ +int vmi_stop_hz_timer(void) +{ + /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ + + unsigned long seq, next; + unsigned long long real_cycles_expiry; + int cpu = smp_processor_id(); + int idle; + + BUG_ON(!irqs_disabled()); + if (sysctl_hz_timer != 0) + return 0; + + cpu_set(cpu, nohz_cpu_mask); + smp_mb(); + if (rcu_needs_cpu(cpu) || local_softirq_pending() || + (next = next_timer_interrupt(), time_before_eq(next, jiffies))) { + cpu_clear(cpu, nohz_cpu_mask); + next = jiffies; + idle = 0; + } else + idle = 1; + + /* Convert jiffies to the real cycle counter. */ + do { + seq = read_seqbegin(&xtime_lock); + real_cycles_expiry = real_cycles_accounted_system + + (long)(next - jiffies) * cycles_per_jiffy; + } while (read_seqretry(&xtime_lock, seq)); + + /* This cpu is going idle. Disable the periodic alarm. */ + if (idle) { + vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); + per_cpu(idle_start_jiffies, cpu) = jiffies; + } + + /* Set the real time alarm to expire at the next event. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, + real_cycles_expiry, 0); + + return idle; +} + +static void vmi_reenable_hz_timer(int cpu) +{ + /* For /proc/vmi/info idle_hz stat. */ + per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); + per_cpu(vmi_idle_no_hz_irqs, cpu)++; + + /* Don't bother explicitly cancelling the one-shot alarm -- at + * worse we will receive a spurious timer interrupt. */ + vmi_timer_ops.set_alarm( + VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, + per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, + cycles_per_alarm); + /* Indicate this cpu is no longer nohz idle. */ + cpu_clear(cpu, nohz_cpu_mask); +} + +/* Called from interrupt handlers when (local) HZ timer is disabled. */ +void vmi_account_time_restart_hz_timer(void) +{ + unsigned long long cur_real_cycles, cur_process_times_cycles; + int cpu = smp_processor_id(); + + BUG_ON(!irqs_disabled()); + /* Account the time during which the HZ timer was disabled. */ + cur_real_cycles = read_real_cycles(); + cur_process_times_cycles = read_available_cycles(); + /* Update system wide (real) time state (xtime, jiffies). */ + vmi_account_real_cycles(cur_real_cycles); + /* Update per-cpu idle times. */ + vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); + /* Update time stolen from this cpu by the hypervisor. */ + vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); + /* Reenable the hz timer. */ + vmi_reenable_hz_timer(cpu); +} + +#endif /* CONFIG_NO_IDLE_HZ */ + +/* UP (and no local-APIC) VMI-timer alarm interrupt handler. + * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after + * APIC setup and setup_boot_vmi_alarm() is called. */ +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + vmi_local_timer_interrupt(smp_processor_id()); + return IRQ_HANDLED; +} + +#ifdef CONFIG_X86_LOCAL_APIC + +/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. + * Also used in UP when CONFIG_X86_LOCAL_APIC. + * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ +void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + int cpu = smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat,cpu).apic_timer_irqs++; + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + vmi_local_timer_interrupt(cpu); + irq_exit(); + set_irq_regs(old_regs); +} + +#endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 5038a73d554..ca51610955d 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -37,9 +37,14 @@ SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = startup_32 - LOAD_OFFSET; + + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ + *(.text.head) + } :text = 0x9090 + /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ *(.text) SCHED_TEXT LOCK_TEXT diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c index 9819b705efa..2e2c51a8bd3 100644 --- a/arch/i386/math-emu/get_address.c +++ b/arch/i386/math-emu/get_address.c @@ -56,15 +56,14 @@ static int reg_offset_vm86[] = { #define VM86_REG_(x) (*(unsigned short *) \ (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) -/* These are dummy, fs and gs are not saved on the stack. */ -#define ___FS ___ds +/* This dummy, gs is not saved on the stack. */ #define ___GS ___ds static int reg_offset_pm[] = { offsetof(struct info,___cs), offsetof(struct info,___ds), offsetof(struct info,___es), - offsetof(struct info,___FS), + offsetof(struct info,___fs), offsetof(struct info,___GS), offsetof(struct info,___ss), offsetof(struct info,___ds) @@ -169,13 +168,10 @@ static long pm_address(u_char FPU_modrm, u_char segment, switch ( segment ) { - /* fs and gs aren't used by the kernel, so they still have their - user-space values. */ - case PREFIX_FS_-1: - /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ - savesegment(fs, addr->selector); - break; + /* gs isn't used by the kernel, so it still has its + user-space value. */ case PREFIX_GS_-1: + /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ savesegment(gs, addr->selector); break; default: diff --git a/arch/i386/math-emu/status_w.h b/arch/i386/math-emu/status_w.h index 78d7b7689dd..59e73302aa6 100644 --- a/arch/i386/math-emu/status_w.h +++ b/arch/i386/math-emu/status_w.h @@ -48,9 +48,11 @@ #define status_word() \ ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) -#define setcc(cc) ({ \ - partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); \ - partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); }) +static inline void setcc(int cc) +{ + partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); + partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); +} #ifdef PECULIAR_486 /* Default, this conveys no information, but an 80486 does it. */ diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index e0c390d6ceb..aa58720f687 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -101,7 +101,6 @@ extern void find_max_pfn(void); extern void add_one_highpage_init(struct page *, int, int); extern struct e820map e820; -extern unsigned long init_pg_tables_end; extern unsigned long highend_pfn, highstart_pfn; extern unsigned long max_low_pfn; extern unsigned long totalram_pages; diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index cba9b3894a3..b8c4e259fc8 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -46,17 +46,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) +static inline int notify_page_fault(struct pt_regs *regs, long err) { struct die_args args = { .regs = regs, - .str = str, + .str = "page fault", .err = err, - .trapnr = trap, - .signr = sig + .trapnr = 14, + .signr = SIGSEGV }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); } /* @@ -327,8 +327,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) return; - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -337,8 +336,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* It's safe to allow irq's after cr2 has been saved and the vmalloc diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index c5c5ea700cc..ae436882af7 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -62,6 +62,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); if (pmd_table != pmd_offset(pud, 0)) @@ -82,6 +83,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) { if (pmd_none(*pmd)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -345,6 +347,8 @@ static void __init pagetable_init (void) /* Init entries of the first-level page table to the zero page */ for (i = 0; i < PTRS_PER_PGD; i++) set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); +#else + paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); #endif /* Enable PSE if available */ diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index e223b1d4981..412ebbd8adb 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,6 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); + paravirt_alloc_pt(page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? prot : ref_prot)); @@ -172,6 +173,7 @@ __change_page_attr(struct page *page, pgprot_t prot) if (!PageReserved(kpte_page)) { if (cpu_has_pse && (page_private(kpte_page) == 0)) { ClearPagePrivate(kpte_page); + paravirt_release_pt(page_to_pfn(kpte_page)); list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index f349eaf450b..fa0cfbd551e 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -171,6 +171,8 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) void reserve_top_address(unsigned long reserve) { BUG_ON(fixmaps > 0); + printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", + (int)-reserve); #ifdef CONFIG_COMPAT_VDSO BUG_ON(reserve != 0); #else @@ -248,9 +250,15 @@ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); + if (PTRS_PER_PMD > 1) return; + /* must happen under lock */ + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); + pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } @@ -260,6 +268,7 @@ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -277,13 +286,18 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } return pgd; out_oom: - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + for (i--; i >= 0; i--) { + pgd_t pgdent = pgd[i]; + void* pmd = (void *)__va(pgd_val(pgdent)-1); + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + kmem_cache_free(pmd_cache, pmd); + } kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -294,8 +308,12 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pgd_t pgdent = pgd[i]; + void* pmd = (void *)__va(pgd_val(pgdent)-1); + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); + kmem_cache_free(pmd_cache, pmd); + } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c index ca2447e05e1..c554f52cb80 100644 --- a/arch/i386/oprofile/op_model_ppro.c +++ b/arch/i386/oprofile/op_model_ppro.c @@ -24,7 +24,8 @@ #define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0) -#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), -1);} while (0) +#define CTR_32BIT_WRITE(l,msrs,c) \ + do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) #define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) @@ -79,7 +80,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs,i))) continue; - CTR_WRITE(1, msrs, i); + CTR_32BIT_WRITE(1, msrs, i); } /* enable active counters */ @@ -87,7 +88,7 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) { reset_value[i] = counter_config[i].count; - CTR_WRITE(counter_config[i].count, msrs, i); + CTR_32BIT_WRITE(counter_config[i].count, msrs, i); CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); @@ -116,7 +117,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); + CTR_32BIT_WRITE(reset_value[i], msrs, i); } } diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile index 1594d2f55c8..44650e03308 100644 --- a/arch/i386/pci/Makefile +++ b/arch/i386/pci/Makefile @@ -1,7 +1,7 @@ obj-y := i386.o init.o obj-$(CONFIG_PCI_BIOS) += pcbios.o -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o pci-y := fixup.o diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c new file mode 100644 index 00000000000..747d8c63b0c --- /dev/null +++ b/arch/i386/pci/mmconfig-shared.c @@ -0,0 +1,264 @@ +/* + * mmconfig-shared.c - Low-level direct PCI config space access via + * MMCONFIG - common code between i386 and x86-64. + * + * This code does: + * - known chipset handling + * - ACPI decoding and validation + * + * Per-architecture code takes care of the mappings and accesses + * themselves. + */ + +#include <linux/pci.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/bitmap.h> +#include <asm/e820.h> + +#include "pci.h" + +/* aperture is up to 256MB but BIOS may reserve less */ +#define MMCONFIG_APER_MIN (2 * 1024*1024) +#define MMCONFIG_APER_MAX (256 * 1024*1024) + +DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); + +/* K8 systems have some devices (typically in the builtin northbridge) + that are only accessible using type1 + Normally this can be expressed in the MCFG by not listing them + and assigning suitable _SEGs, but this isn't implemented in some BIOS. + Instead try to discover all devices on bus 0 that are unreachable using MM + and fallback for them. */ +static void __init unreachable_devices(void) +{ + int i, bus; + /* Use the max bus number from ACPI here? */ + for (bus = 0; bus < PCI_MMCFG_MAX_CHECK_BUS; bus++) { + for (i = 0; i < 32; i++) { + unsigned int devfn = PCI_DEVFN(i, 0); + u32 val1, val2; + + pci_conf1_read(0, bus, devfn, 0, 4, &val1); + if (val1 == 0xffffffff) + continue; + + if (pci_mmcfg_arch_reachable(0, bus, devfn)) { + raw_pci_ops->read(0, bus, devfn, 0, 4, &val2); + if (val1 == val2) + continue; + } + set_bit(i + 32 * bus, pci_mmcfg_fallback_slots); + printk(KERN_NOTICE "PCI: No mmconfig possible on device" + " %02x:%02x\n", bus, i); + } + } +} + +static const char __init *pci_mmcfg_e7520(void) +{ + u32 win; + pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0xce, 2, &win); + + pci_mmcfg_config_num = 1; + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + pci_mmcfg_config[0].address = (win & 0xf000) << 16; + pci_mmcfg_config[0].pci_segment = 0; + pci_mmcfg_config[0].start_bus_number = 0; + pci_mmcfg_config[0].end_bus_number = 255; + + return "Intel Corporation E7520 Memory Controller Hub"; +} + +static const char __init *pci_mmcfg_intel_945(void) +{ + u32 pciexbar, mask = 0, len = 0; + + pci_mmcfg_config_num = 1; + + pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0x48, 4, &pciexbar); + + /* Enable bit */ + if (!(pciexbar & 1)) + pci_mmcfg_config_num = 0; + + /* Size bits */ + switch ((pciexbar >> 1) & 3) { + case 0: + mask = 0xf0000000U; + len = 0x10000000U; + break; + case 1: + mask = 0xf8000000U; + len = 0x08000000U; + break; + case 2: + mask = 0xfc000000U; + len = 0x04000000U; + break; + default: + pci_mmcfg_config_num = 0; + } + + /* Errata #2, things break when not aligned on a 256Mb boundary */ + /* Can only happen in 64M/128M mode */ + + if ((pciexbar & mask) & 0x0fffffffU) + pci_mmcfg_config_num = 0; + + if (pci_mmcfg_config_num) { + pci_mmcfg_config = kzalloc(sizeof(pci_mmcfg_config[0]), GFP_KERNEL); + if (!pci_mmcfg_config) + return NULL; + pci_mmcfg_config[0].address = pciexbar & mask; + pci_mmcfg_config[0].pci_segment = 0; + pci_mmcfg_config[0].start_bus_number = 0; + pci_mmcfg_config[0].end_bus_number = (len >> 20) - 1; + } + + return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; +} + +struct pci_mmcfg_hostbridge_probe { + u32 vendor; + u32 device; + const char *(*probe)(void); +}; + +static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = { + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 }, + { PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82945G_HB, pci_mmcfg_intel_945 }, +}; + +static int __init pci_mmcfg_check_hostbridge(void) +{ + u32 l; + u16 vendor, device; + int i; + const char *name; + + pci_conf1_read(0, 0, PCI_DEVFN(0,0), 0, 4, &l); + vendor = l & 0xffff; + device = (l >> 16) & 0xffff; + + pci_mmcfg_config_num = 0; + pci_mmcfg_config = NULL; + name = NULL; + + for (i = 0; !name && i < ARRAY_SIZE(pci_mmcfg_probes); i++) { + if (pci_mmcfg_probes[i].vendor == vendor && + pci_mmcfg_probes[i].device == device) + name = pci_mmcfg_probes[i].probe(); + } + + if (name) { + printk(KERN_INFO "PCI: Found %s %s MMCONFIG support.\n", + name, pci_mmcfg_config_num ? "with" : "without"); + } + + return name != NULL; +} + +static void __init pci_mmcfg_insert_resources(void) +{ +#define PCI_MMCFG_RESOURCE_NAME_LEN 19 + int i; + struct resource *res; + char *names; + unsigned num_buses; + + res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), + pci_mmcfg_config_num, GFP_KERNEL); + if (!res) { + printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); + return; + } + + names = (void *)&res[pci_mmcfg_config_num]; + for (i = 0; i < pci_mmcfg_config_num; i++, res++) { + struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i]; + num_buses = cfg->end_bus_number - cfg->start_bus_number + 1; + res->name = names; + snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", + cfg->pci_segment); + res->start = cfg->address; + res->end = res->start + (num_buses << 20) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + names += PCI_MMCFG_RESOURCE_NAME_LEN; + } +} + +static void __init pci_mmcfg_reject_broken(int type) +{ + typeof(pci_mmcfg_config[0]) *cfg; + + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return; + + cfg = &pci_mmcfg_config[0]; + + /* + * Handle more broken MCFG tables on Asus etc. + * They only contain a single entry for bus 0-0. + */ + if (pci_mmcfg_config_num == 1 && + cfg->pci_segment == 0 && + (cfg->start_bus_number | cfg->end_bus_number) == 0) { + printk(KERN_ERR "PCI: start and end of bus number is 0. " + "Rejected as broken MCFG.\n"); + goto reject; + } + + /* + * Only do this check when type 1 works. If it doesn't work + * assume we run on a Mac and always use MCFG + */ + if (type == 1 && !e820_all_mapped(cfg->address, + cfg->address + MMCONFIG_APER_MIN, + E820_RESERVED)) { + printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" + " E820-reserved\n", cfg->address); + goto reject; + } + return; + +reject: + printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + kfree(pci_mmcfg_config); + pci_mmcfg_config = NULL; + pci_mmcfg_config_num = 0; +} + +void __init pci_mmcfg_init(int type) +{ + int known_bridge = 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return; + + if (type == 1 && pci_mmcfg_check_hostbridge()) + known_bridge = 1; + + if (!known_bridge) { + acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); + pci_mmcfg_reject_broken(type); + } + + if ((pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return; + + if (pci_mmcfg_arch_init()) { + if (type == 1) + unreachable_devices(); + if (known_bridge) + pci_mmcfg_insert_resources(); + pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + } +} diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 5700220dcf5..bb1afd9e589 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -15,55 +15,33 @@ #include <asm/e820.h> #include "pci.h" -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - /* Assume systems with more busses have correct MCFG */ -#define MAX_CHECK_BUS 16 - #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) /* The base address of the last MMCONFIG device accessed */ static u32 mmcfg_last_accessed_device; static int mmcfg_last_accessed_cpu; -static DECLARE_BITMAP(fallback_slots, MAX_CHECK_BUS*32); - /* * Functions for accessing PCI configuration space with MMCONFIG accesses */ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) { - int cfg_num = -1; struct acpi_mcfg_allocation *cfg; + int cfg_num; - if (seg == 0 && bus < MAX_CHECK_BUS && - test_bit(PCI_SLOT(devfn) + 32*bus, fallback_slots)) + if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && + test_bit(PCI_SLOT(devfn) + 32*bus, pci_mmcfg_fallback_slots)) return 0; - while (1) { - ++cfg_num; - if (cfg_num >= pci_mmcfg_config_num) { - break; - } + for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = &pci_mmcfg_config[cfg_num]; - if (cfg->pci_segment != seg) - continue; - if ((cfg->start_bus_number <= bus) && + if (cfg->pci_segment == seg && + (cfg->start_bus_number <= bus) && (cfg->end_bus_number >= bus)) return cfg->address; } - /* Handle more broken MCFG tables on Asus etc. - They only contain a single entry for bus 0-0. Assume - this applies to all busses. */ - cfg = &pci_mmcfg_config[0]; - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) - return cfg->address; - /* Fall back to type 0 */ return 0; } @@ -158,67 +136,15 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -/* K8 systems have some devices (typically in the builtin northbridge) - that are only accessible using type1 - Normally this can be expressed in the MCFG by not listing them - and assigning suitable _SEGs, but this isn't implemented in some BIOS. - Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. */ -static __init void unreachable_devices(void) +int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn) { - int i, k; - unsigned long flags; - - for (k = 0; k < MAX_CHECK_BUS; k++) { - for (i = 0; i < 32; i++) { - u32 val1; - u32 addr; - - pci_conf1_read(0, k, PCI_DEVFN(i, 0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - - /* Locking probably not needed, but safer */ - spin_lock_irqsave(&pci_config_lock, flags); - addr = get_base_addr(0, k, PCI_DEVFN(i, 0)); - if (addr != 0) - pci_exp_set_dev_base(addr, k, PCI_DEVFN(i, 0)); - if (addr == 0 || - readl((u32 __iomem *)mmcfg_virt_addr) != val1) { - set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE - "PCI: No mmconfig possible on %x:%x\n", k, i); - } - spin_unlock_irqrestore(&pci_config_lock, flags); - } - } + return get_base_addr(seg, bus, devfn) != 0; } -void __init pci_mmcfg_init(int type) +int __init pci_mmcfg_arch_init(void) { - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - /* Only do this check when type 1 works. If it doesn't work - assume we run on a Mac and always use MCFG */ - if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address, - pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n", - (unsigned long)pci_mmcfg_config[0].address); - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); - return; - } - printk(KERN_INFO "PCI: Using MMCONFIG\n"); raw_pci_ops = &pci_mmcfg; - pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; - - unreachable_devices(); + return 1; } diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index a0a25180b61..e58bae2076a 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -94,3 +94,13 @@ extern void pci_pcbios_init(void); extern void pci_mmcfg_init(int type); extern void pcibios_sort(void); +/* pci-mmconfig.c */ + +/* Verify the first 16 busses. We assume that systems with more busses + get MCFG right. */ +#define PCI_MMCFG_MAX_CHECK_BUS 16 +extern DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); + +extern int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn); +extern int __init pci_mmcfg_arch_init(void); diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 02dd39457bc..7982cbc3bc9 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -152,18 +152,18 @@ config MPSC Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs with Intel Extended Memory 64 Technology(EM64T). For details see <http://www.intel.com/technology/64bitextensions/>. - Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distingush them + Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the + Netburst core and shouldn't use this option. You can distinguish them using the cpu family field - in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one - (this rule only applies to system that support EM64T) + in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one + (this rule only applies to systems that support EM64T) config MCORE2 bool "Intel Core2 / newer Xeon" help Optimize for Intel Core2 and newer Xeons (51xx) - You can distingush the newer Xeons from the older ones using - the cpu family field in /proc/cpuinfo. 15 is a older Xeon + You can distinguish the newer Xeons from the older ones using + the cpu family field in /proc/cpuinfo. 15 is an older Xeon (use CONFIG_MPSC then), 6 is a newer one. This rule only applies to CPUs that support EM64T. @@ -458,8 +458,8 @@ config IOMMU on systems with more than 3GB. This is usually needed for USB, sound, many IDE/SATA chipsets and some other devices. Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART - based IOMMU and a software bounce buffer based IOMMU used on Intel - systems and as fallback. + based hardware IOMMU and a software bounce buffer based IOMMU used + on Intel systems and as fallback. The code is only active when needed (enough memory and limited device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified too. @@ -496,6 +496,12 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool + help + Support for software bounce buffers used on x86-64 systems + which don't have a hardware IOMMU (e.g. the current generation + of Intel's x86-64 CPUs). Using this PCI devices which can only + access 32-bits of memory can be used on systems with more than + 3 GB of memory. If unsure, say Y. config X86_MCE bool "Machine check support" if EMBEDDED diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 69584c29530..293a4a4c609 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-rc3 -# Fri Jan 5 11:54:41 2007 +# Linux kernel version: 2.6.20-git8 +# Tue Feb 13 11:25:16 2007 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -11,6 +11,7 @@ CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y +CONFIG_ZONE_DMA=y CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y @@ -153,6 +154,7 @@ CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_RESOURCES_64BIT=y +CONFIG_ZONE_DMA_FLAG=1 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 @@ -201,13 +203,14 @@ CONFIG_ACPI=y CONFIG_ACPI_SLEEP=y CONFIG_ACPI_SLEEP_PROC_FS=y CONFIG_ACPI_SLEEP_PROC_SLEEP=y +CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set +# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y @@ -263,7 +266,6 @@ CONFIG_PCI_MMCONFIG=y CONFIG_PCIEPORTBUS=y CONFIG_PCIEAER=y CONFIG_PCI_MSI=y -# CONFIG_PCI_MULTITHREAD_PROBE is not set # CONFIG_PCI_DEBUG is not set # CONFIG_HT_IRQ is not set @@ -398,6 +400,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set # @@ -466,6 +469,7 @@ CONFIG_BLK_DEV_IDECD=y # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_TASK_IOCTL is not set # @@ -497,6 +501,7 @@ CONFIG_BLK_DEV_ATIIXP=y # CONFIG_BLK_DEV_JMICRON is not set # CONFIG_BLK_DEV_SC1200 is not set CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_IT8213 is not set # CONFIG_BLK_DEV_IT821X is not set # CONFIG_BLK_DEV_NS87415 is not set # CONFIG_BLK_DEV_PDC202XX_OLD is not set @@ -507,6 +512,7 @@ CONFIG_BLK_DEV_PDC202XX_NEW=y # CONFIG_BLK_DEV_SLC90E66 is not set # CONFIG_BLK_DEV_TRM290 is not set # CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_BLK_DEV_TC86C001 is not set # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -599,6 +605,7 @@ CONFIG_MEGARAID_SAS=y # Serial ATA (prod) and Parallel ATA (experimental) drivers # CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set CONFIG_SATA_AHCI=y CONFIG_SATA_SVW=y CONFIG_ATA_PIIX=y @@ -614,6 +621,7 @@ CONFIG_SATA_SIL=y # CONFIG_SATA_ULI is not set CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set @@ -630,6 +638,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_HPT3X2N is not set # CONFIG_PATA_HPT3X3 is not set # CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set @@ -682,9 +691,7 @@ CONFIG_IEEE1394=y # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_OUI_DB is not set # CONFIG_IEEE1394_EXTRA_CONFIG_ROMS is not set -# CONFIG_IEEE1394_EXPORT_FULL_API is not set # # Device Drivers @@ -707,6 +714,11 @@ CONFIG_IEEE1394_RAWIO=y # CONFIG_I2O is not set # +# Macintosh device drivers +# +# CONFIG_MAC_EMUMOUSEBTN is not set + +# # Network device support # CONFIG_NETDEVICES=y @@ -774,6 +786,7 @@ CONFIG_8139TOO=y # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set # CONFIG_VIA_RHINE is not set +# CONFIG_SC92031 is not set # # Ethernet (1000 Mbit) @@ -795,11 +808,13 @@ CONFIG_E1000=y CONFIG_TIGON3=y CONFIG_BNX2=y # CONFIG_QLA3XXX is not set +# CONFIG_ATL1 is not set # # Ethernet (10000 Mbit) # # CONFIG_CHELSIO_T1 is not set +# CONFIG_CHELSIO_T3 is not set # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set @@ -1115,6 +1130,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OBSOLETE_OSS=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y @@ -1128,6 +1144,7 @@ CONFIG_SOUND_ICH=y # HID Devices # CONFIG_HID=y +# CONFIG_HID_DEBUG is not set # # USB support @@ -1142,10 +1159,8 @@ CONFIG_USB=y # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y -# CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set -# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1155,9 +1170,11 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set +# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y -# CONFIG_USB_OHCI_BIG_ENDIAN is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set +# CONFIG_USB_OHCI_BIG_ENDIAN_MMIO is not set CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set @@ -1208,6 +1225,7 @@ CONFIG_USB_HID=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set +# CONFIG_USB_GTCO is not set # # USB Imaging devices @@ -1313,6 +1331,10 @@ CONFIG_USB_MON=y # # +# Auxiliary Display support +# + +# # Virtualization # # CONFIG_KVM is not set @@ -1512,6 +1534,7 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_FS=y # CONFIG_HEADERS_CHECK is not set CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set @@ -1520,7 +1543,6 @@ CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set @@ -1560,4 +1582,5 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y -CONFIG_IOMAP_COPY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index ff499ef2a1b..359eacc3850 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -21,6 +21,7 @@ #include <linux/stddef.h> #include <linux/personality.h> #include <linux/compat.h> +#include <linux/binfmts.h> #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> @@ -449,7 +450,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, /* Return stub is in 32bit vsyscall page */ { - void __user *restorer = VSYSCALL32_SIGRETURN; + void __user *restorer; + if (current->binfmt->hasvdso) + restorer = VSYSCALL32_SIGRETURN; + else + restorer = (void *)&frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); @@ -495,7 +500,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif @@ -601,7 +606,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5f32cf4de5f..eda7a0d4dc1 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -718,4 +718,5 @@ ia32_sys_call_table: .quad compat_sys_vmsplice .quad compat_sys_move_pages .quad sys_getcpu + .quad sys_epoll_pwait ia32_syscall_end: diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 3c7cbff04d3..ae399458024 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o +obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 @@ -56,3 +57,4 @@ quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o +pcspeaker-y += ../../i386/kernel/pcspeaker.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 5ebf62c7a3d..23178ce6c78 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -58,7 +58,7 @@ unsigned long acpi_wakeup_address = 0; unsigned long acpi_video_flags; extern char wakeup_start, wakeup_end; -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); +extern unsigned long acpi_copy_wakeup_routine(unsigned long); static pgd_t low_ptr; diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 6fe191c5808..4651fd22b21 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } +#ifdef CONFIG_NUMA + /* NUMA memory to node map */ + if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { + *addrp = nodemap_addr + nodemap_size; + return 1; + } +#endif /* XXX ramdisk image here? */ return 0; } @@ -184,6 +191,37 @@ unsigned long __init e820_end_of_ram(void) } /* + * Find the hole size in the range. + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr >= end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr) + ram += last - addr; + } + return ((end - start) - ram); +} + +/* * Mark e820 reserved areas as busy for the resource manager. */ void __init e820_reserve_resources(void) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 1e6f8087067..598a4d0351f 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -163,6 +163,20 @@ startup_64: */ lgdt cpu_gdt_descr + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + /* * Setup up a dummy PDA. this is just for some early bootup code * that does in_interrupt() @@ -173,12 +187,6 @@ startup_64: shrq $32,%rdx wrmsr - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - movl %eax,%ss - movl %eax,%es - /* esi is pointer to real mode structure with interesting info. pass it to C */ movl %esi, %edi diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 6be6730acb5..566e64d966c 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -831,7 +831,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); @@ -839,7 +839,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (irq_trigger(idx)) { entry.trigger = 1; entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); } if (!apic && !IO_APIC_IRQ(irq)) @@ -851,7 +851,7 @@ static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) if (vector < 0) return; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.vector = vector; ioapic_register_intr(irq, vector, IOAPIC_AUTO); @@ -920,7 +920,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in */ entry.dest_mode = INT_DEST_MODE; entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1020,18 +1020,17 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X %02X ", + printk(KERN_DEBUG " %02x %03X ", i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest + entry.dest ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", @@ -1293,8 +1292,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(apic_read(APIC_ID)); + entry.dest = GET_APIC_ID(apic_read(APIC_ID)); /* * Add it to the IO-APIC irq-routing table: @@ -1556,7 +1554,7 @@ static inline void unlock_ExtINT_logic(void) entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; @@ -2131,7 +2129,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index fe063d3cfe4..745b1f0f494 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -114,6 +114,6 @@ asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); return 0; } diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 0c06af6c13b..3bc30d2c13d 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -18,6 +18,7 @@ #include <asm/uaccess.h> #include <asm/io_apic.h> #include <asm/idle.h> +#include <asm/smp.h> atomic_t irq_err_count; @@ -120,9 +121,14 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) if (likely(irq < NR_IRQS)) generic_handle_irq(irq); - else if (printk_ratelimit()) - printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", - __func__, smp_processor_id(), vector); + else { + if (!disable_apic) + ack_APIC_irq(); + + if (printk_ratelimit()) + printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", + __func__, smp_processor_id(), vector); + } irq_exit(); diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bdb54a2c9f1..8011a8e1c7d 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -19,6 +19,7 @@ #include <linux/cpu.h> #include <linux/percpu.h> #include <linux/ctype.h> +#include <linux/kmod.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -42,6 +43,10 @@ static unsigned long console_logged; static int notify_user; static int rip_msr; static int mce_bootlog = 1; +static atomic_t mce_events; + +static char trigger[128]; +static char *trigger_argv[2] = { trigger, NULL }; /* * Lockless MCE logging infrastructure. @@ -57,6 +62,7 @@ struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + atomic_inc(&mce_events); mce->finished = 0; wmb(); for (;;) { @@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } +static void do_mce_trigger(void) +{ + static atomic_t mce_logged; + int events = atomic_read(&mce_events); + if (events != atomic_read(&mce_logged) && trigger[0]) { + /* Small race window, but should be harmless. */ + atomic_set(&mce_logged, events); + call_usermodehelper(trigger, trigger_argv, NULL, -1); + } +} + /* * The actual machine check handler */ @@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) + if (!regs) { + /* Normal interrupt context here. Call trigger for any new + events. */ + do_mce_trigger(); goto out; + } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ @@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce); } \ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); +/* TBD should generate these dynamically based on number of available banks */ ACCESSOR(bank0ctl,bank[0],mce_restart()) ACCESSOR(bank1ctl,bank[1],mce_restart()) ACCESSOR(bank2ctl,bank[2],mce_restart()) ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static struct sysdev_attribute * bank_attributes[NR_BANKS] = { - &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, - &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; + +static ssize_t show_trigger(struct sys_device *s, char *buf) +{ + strcpy(buf, trigger); + strcat(buf, "\n"); + return strlen(trigger) + 1; +} + +static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +{ + char *p; + int len; + strncpy(trigger, buf, sizeof(trigger)); + trigger[sizeof(trigger)-1] = 0; + len = strlen(trigger); + p = strchr(trigger, '\n'); + if (*p) *p = 0; + return len; +} + +static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); ACCESSOR(tolerant,tolerant,) ACCESSOR(check_interval,check_interval,mce_restart()) +static struct sysdev_attribute *mce_attributes[] = { + &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, + &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, + &attr_tolerant, &attr_check_interval, &attr_trigger, + NULL +}; /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) err = sysdev_register(&per_cpu(device_mce,cpu)); if (!err) { - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_create_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); } return err; } @@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu) { int i; - for (i = 0; i < banks; i++) + for (i = 0; mce_attributes[i]; i++) sysdev_remove_file(&per_cpu(device_mce,cpu), - bank_attributes[i]); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); - sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); + mce_attributes[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 93c70725763..d0bd5d66e10 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -37,6 +37,8 @@ #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 #define MASK_LVTOFF_HI 0x00F00000 #define MASK_COUNT_EN_HI 0x00080000 #define MASK_INT_TYPE_HI 0x00060000 @@ -122,14 +124,17 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -138,8 +143,8 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; if (!block) @@ -187,17 +192,22 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; - else if (block == 1) - address = MCG_XBLK_ADDR - + ((low & MASK_BLKPTR_LO) >> 21); + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else ++address; if (rdmsr_safe(address, &low, &high)) - continue; + break; if (!(high & MASK_VALID_HI)) { if (block) @@ -206,10 +216,14 @@ asmlinkage void mce_threshold_interrupt(void) break; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) continue; + /* Log the machine check that caused the threshold + event. */ + do_machine_check(NULL, 0); + if (high & MASK_OVERFLOW_HI) { rdmsrl(address, m.misc); rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, @@ -385,7 +399,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; if (rdmsr_safe(address, &low, &high)) - goto recurse; + return 0; if (!(high & MASK_VALID_HI)) { if (block) @@ -394,8 +408,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, return 0; } - if (!(high & MASK_VALID_HI >> 1) || - (high & MASK_VALID_HI >> 2)) + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) goto recurse; b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9cb42ecb7f8..486f4c61a94 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -172,7 +172,7 @@ static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15; + return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return 1; @@ -214,6 +214,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static unsigned int adjust_for_32bit_ctr(unsigned int hz) +{ + unsigned int retval = hz; + + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { + retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + return retval; +} + int __init check_nmi_watchdog (void) { int *counts; @@ -268,17 +285,8 @@ int __init check_nmi_watchdog (void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && - ((u64)cpu_khz * 1000) > 0x7fffffffULL) { - nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) + nmi_hz = adjust_for_32bit_ctr(nmi_hz); } kfree(counts); @@ -360,6 +368,33 @@ void enable_timer_nmi_watchdog(void) } } +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -634,7 +669,9 @@ static int setup_intel_arch_watchdog(void) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + nmi_hz = adjust_for_32bit_ctr(nmi_hz); + wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -855,15 +892,23 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { /* * ArchPerfom/Core Duo needs to re-unmask * the apic vector */ apic_write(APIC_LVTPC, APIC_DM_NMI); + /* ARCH_PERFMON has 32 bit counter writes */ + wrmsr(wd->perfctr_msr, + (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); + } else { + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, + -((u64)cpu_khz * 1000 / nmi_hz)); } - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; } else if (nmi_watchdog == NMI_IO_APIC) { /* don't know how to accurately check for this. diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 3d65b1d4c2b..04480c3b68f 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -138,6 +138,8 @@ static const unsigned long phb_debug_offsets[] = { #define PHB_DEBUG_STUFF_OFFSET 0x0020 +#define EMERGENCY_PAGES 32 /* = 128KB */ + unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; @@ -296,6 +298,16 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, { unsigned long entry; unsigned long badbit; + unsigned long badend; + + /* were we called with bad_dma_address? */ + badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + WARN_ON(1); + return; + } entry = dma_addr >> PAGE_SHIFT; @@ -656,8 +668,8 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) u64 start; struct iommu_table *tbl = dev->sysdata; - /* reserve bad_dma_address in case it's a legal address */ - iommu_range_reserve(tbl, bad_dma_address, 1); + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ + iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ start = (640 * 1024); @@ -1176,6 +1188,7 @@ int __init calgary_iommu_init(void) } force_iommu = 1; + bad_dma_address = 0x0; dma_ops = &calgary_dma_ops; return 0; diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 683b7a5c1ab..651ccfb0669 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -223,30 +223,10 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] - [,forcesac][,fullflush][,nomerge][,biomerge] - size set size of iommu (in bytes) - noagp don't initialize the AGP driver and use full aperture. - off don't use the IOMMU - leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) - memaper[=order] allocate an own aperture over RAM with size 32MB^order. - noforce don't force IOMMU usage. Default. - force Force IOMMU. - merge Do lazy merging. This may improve performance on some block devices. - Implies force (experimental) - biomerge Do merging at the BIO layer. This is more efficient than merge, - but should be only done with very big IOMMUs. Implies merge,force. - nomerge Don't do SG merging. - forcesac For SAC mode for masks <40bits (experimental) - fullflush Flush IOMMU on each allocation (default) - nofullflush Don't use IOMMU fullflush - allowed overwrite iommu off workarounds for specific chipsets. - soft Use software bounce buffering (default for Intel machines) - noaperture Don't touch the aperture for AGP. - allowdac Allow DMA >4GB - nodac Forbid DMA >4GB - panic Force panic when IOMMU overflows -*/ +/* + * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter + * documentation. + */ __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index fc1960f1f24..030eb375335 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -185,7 +185,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; if (force_iommu) mmu = 1; @@ -195,7 +195,7 @@ static inline int need_iommu(struct device *dev, unsigned long addr, size_t size static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { u64 mask = *dev->dma_mask; - int high = addr + size >= mask; + int high = addr + size > mask; int mmu = high; return mmu; } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index addc14af0c5..4326a690a50 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -536,8 +536,12 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) } ret = 0; for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __get_user(tmp, (unsigned long __user *) data); - putreg(child, ui, tmp); + ret = __get_user(tmp, (unsigned long __user *) data); + if (ret) + break; + ret = putreg(child, ui, tmp); + if (ret) + break; data += sizeof(long); } break; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 60477244d1a..3d98b696881 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -138,128 +138,6 @@ struct resource code_resource = { .flags = IORESOURCE_RAM, }; -#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource adapter_rom_resources[] = { - { .name = "Adapter ROM", .start = 0xc8000, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM } -}; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_ROM, -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_RAM, -}; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; - start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -444,6 +322,11 @@ void __init setup_arch(char **cmdline_p) /* reserve ebda region */ if (ebda_addr) reserve_bootmem_generic(ebda_addr, ebda_size); +#ifdef CONFIG_NUMA + /* reserve nodemap region */ + if (nodemap_addr) + reserve_bootmem_generic(nodemap_addr, nodemap_size); +#endif #ifdef CONFIG_SMP /* @@ -519,15 +402,11 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); /* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ - probe_roms(); + * We trust e820 completely. No explicit ROM probing in memory. + */ e820_reserve_resources(); e820_mark_nosave_regions(); - request_resource(&iomem_resource, &video_ram_resource); - { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ @@ -1063,7 +942,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -1081,7 +961,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -1091,8 +971,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", NULL, "cr8_legacy", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy", + "altmovcr8", "abm", "sse4a", + "misalignsse", "3dnowprefetch", + "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; @@ -1103,6 +985,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) "ttp", /* thermal trip */ "tm", "stc", + "100mhzsteps", + "hwpstate", + NULL, /* tsc invariant mapped to constant_tsc */ NULL, /* nothing */ /* constant_tsc - moved to flags */ }; @@ -1219,23 +1104,3 @@ struct seq_operations cpuinfo_op = { .stop = c_stop, .show = show_cpuinfo, }; - -#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) -#include <linux/platform_device.h> -static __init int add_pcspkr(void) -{ - struct platform_device *pd; - int ret; - - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; - - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; -} -device_initcall(add_pcspkr); -#endif diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8c4b80fe71a..6a70b55f719 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -37,7 +37,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 6026b31d037..65ac2c6b34a 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -32,7 +32,7 @@ static void save_stack_address(void *data, unsigned long addr) trace->skip--; return; } - if (trace->nr_entries < trace->max_entries - 1) + if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = addr; } @@ -49,7 +49,8 @@ static struct stacktrace_ops save_stack_ops = { void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { dump_trace(task, NULL, NULL, &save_stack_ops, trace); - trace->entries[trace->nr_entries++] = ULONG_MAX; + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 335cc91c49b..3cc6886f1fb 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -657,6 +657,7 @@ core_initcall(cpufreq_tsc); #define TICK_COUNT 100000000 #define TICK_MIN 5000 +#define MAX_READ_RETRIES 5 /* * Some platforms take periodic SMI interrupts with 5ms duration. Make sure none @@ -664,13 +665,17 @@ core_initcall(cpufreq_tsc); */ static void __init read_hpet_tsc(int *hpet, int *tsc) { - int tsc1, tsc2, hpet1; + int tsc1, tsc2, hpet1, retries = 0; + static int msg; do { tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - } while (tsc2 - tsc1 > TICK_MIN); + } while (tsc2 - tsc1 > TICK_MIN && retries++ < MAX_READ_RETRIES); + if (retries >= MAX_READ_RETRIES && !msg++) + printk(KERN_WARNING + "hpet.c: exceeded max retries to read HPET & TSC\n"); *hpet = hpet1; *tsc = tsc2; } @@ -1221,8 +1226,9 @@ static void hpet_rtc_timer_reinit(void) if (PIE_on) PIE_count += lost_ints; - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", - hpet_rtc_int_freq); + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); } } diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 6d77e4797a4..0dffae69f4a 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(__copy_from_user_inatomic); @@ -34,8 +35,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); #ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +extern void __write_lock_failed(rwlock_t *rw); +extern void __read_lock_failed(rwlock_t *rw); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); #endif diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index b78d4170fce..8d5f835af48 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S new file mode 100644 index 00000000000..4620efb12f1 --- /dev/null +++ b/arch/x86_64/lib/copy_user_nocache.S @@ -0,0 +1,217 @@ +/* Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License v2. + * + * Functions to copy from and to user space. + */ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +#define FIX_ALIGNMENT 1 + +#include <asm/current.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/cpufeature.h> + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * This will force destination/source out of cache for more performance. + * + * Input: + * rdi destination + * rsi source + * rdx count + * rcx zero flag when 1 zero on exception + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(__copy_user_nocache) + CFI_STARTPROC + pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx /* save zero flag */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 + + xorl %eax,%eax /* zero for the exception handler */ + +#ifdef FIX_ALIGNMENT + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jnz .Lbad_alignment +.Lafter_bad_alignment: +#endif + + movq %rdx,%rcx + + movl $64,%ebx + shrq $6,%rdx + decq %rdx + js .Lhandle_tail + + .p2align 4 +.Lloop: +.Ls1: movq (%rsi),%r11 +.Ls2: movq 1*8(%rsi),%r8 +.Ls3: movq 2*8(%rsi),%r9 +.Ls4: movq 3*8(%rsi),%r10 +.Ld1: movnti %r11,(%rdi) +.Ld2: movnti %r8,1*8(%rdi) +.Ld3: movnti %r9,2*8(%rdi) +.Ld4: movnti %r10,3*8(%rdi) + +.Ls5: movq 4*8(%rsi),%r11 +.Ls6: movq 5*8(%rsi),%r8 +.Ls7: movq 6*8(%rsi),%r9 +.Ls8: movq 7*8(%rsi),%r10 +.Ld5: movnti %r11,4*8(%rdi) +.Ld6: movnti %r8,5*8(%rdi) +.Ld7: movnti %r9,6*8(%rdi) +.Ld8: movnti %r10,7*8(%rdi) + + dec %rdx + + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + + jns .Lloop + + .p2align 4 +.Lhandle_tail: + movl %ecx,%edx + andl $63,%ecx + shrl $3,%ecx + jz .Lhandle_7 + movl $8,%ebx + .p2align 4 +.Lloop_8: +.Ls9: movq (%rsi),%r8 +.Ld9: movnti %r8,(%rdi) + decl %ecx + leaq 8(%rdi),%rdi + leaq 8(%rsi),%rsi + jnz .Lloop_8 + +.Lhandle_7: + movl %edx,%ecx + andl $7,%ecx + jz .Lende + .p2align 4 +.Lloop_1: +.Ls10: movb (%rsi),%bl +.Ld10: movb %bl,(%rdi) + incq %rdi + incq %rsi + decl %ecx + jnz .Lloop_1 + + CFI_REMEMBER_STATE +.Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE %rcx + popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx + ret + CFI_RESTORE_STATE + +#ifdef FIX_ALIGNMENT + /* align destination */ + .p2align 4 +.Lbad_alignment: + movl $8,%r9d + subl %ecx,%r9d + movl %r9d,%ecx + cmpq %r9,%rdx + jz .Lhandle_7 + js .Lhandle_7 +.Lalign_1: +.Ls11: movb (%rsi),%bl +.Ld11: movb %bl,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .Lalign_1 + subq %r9,%rdx + jmp .Lafter_bad_alignment +#endif + + /* table sorted by exception address */ + .section __ex_table,"a" + .align 8 + .quad .Ls1,.Ls1e + .quad .Ls2,.Ls2e + .quad .Ls3,.Ls3e + .quad .Ls4,.Ls4e + .quad .Ld1,.Ls1e + .quad .Ld2,.Ls2e + .quad .Ld3,.Ls3e + .quad .Ld4,.Ls4e + .quad .Ls5,.Ls5e + .quad .Ls6,.Ls6e + .quad .Ls7,.Ls7e + .quad .Ls8,.Ls8e + .quad .Ld5,.Ls5e + .quad .Ld6,.Ls6e + .quad .Ld7,.Ls7e + .quad .Ld8,.Ls8e + .quad .Ls9,.Le_quad + .quad .Ld9,.Le_quad + .quad .Ls10,.Le_byte + .quad .Ld10,.Le_byte +#ifdef FIX_ALIGNMENT + .quad .Ls11,.Lzero_rest + .quad .Ld11,.Lzero_rest +#endif + .quad .Le5,.Le_zero + .previous + + /* compute 64-offset for main loop. 8 bytes accuracy with error on the + pessimistic side. this is gross. it would be better to fix the + interface. */ + /* eax: zero, ebx: 64 */ +.Ls1e: addl $8,%eax +.Ls2e: addl $8,%eax +.Ls3e: addl $8,%eax +.Ls4e: addl $8,%eax +.Ls5e: addl $8,%eax +.Ls6e: addl $8,%eax +.Ls7e: addl $8,%eax +.Ls8e: addl $8,%eax + addq %rbx,%rdi /* +64 */ + subq %rax,%rdi /* correct destination with computed offset */ + + shlq $6,%rdx /* loop counter * 64 (stride length) */ + addq %rax,%rdx /* add offset to loopcnt */ + andl $63,%ecx /* remaining bytes */ + addq %rcx,%rdx /* add them */ + jmp .Lzero_rest + + /* exception on quad word loop in tail handling */ + /* ecx: loopcnt/8, %edx: length, rdi: correct */ +.Le_quad: + shll $3,%ecx + andl $7,%edx + addl %ecx,%edx + /* edx: bytes to zero, rdi: dest, eax:zero */ +.Lzero_rest: + cmpl $0,(%rsp) /* zero flag set? */ + jz .Le_zero + movq %rdx,%rcx +.Le_byte: + xorl %eax,%eax +.Le5: rep + stosb + /* when there is another exception while zeroing the rest just return */ +.Le_zero: + movq %rdx,%rax + jmp .Lende + CFI_ENDPROC +ENDPROC(__copy_user_nocache) + + diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 49e8cf2e06f..6ada7231f3a 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -56,17 +56,17 @@ int unregister_page_fault_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) +static inline int notify_page_fault(struct pt_regs *regs, long err) { struct die_args args = { .regs = regs, - .str = str, + .str = "page fault", .err = err, - .trapnr = trap, - .signr = sig + .trapnr = 14, + .signr = SIGSEGV }; - return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); + return atomic_notifier_call_chain(¬ify_page_fault_chain, + DIE_PAGE_FAULT, &args); } /* Sometimes the CPU reports invalid exceptions on prefetch. @@ -355,8 +355,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -365,8 +364,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) + if (notify_page_fault(regs, error_code) == NOTIFY_STOP) return; if (likely(regs->eflags & X86_EFLAGS_IF)) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 2ee2e003606..41b8fb06992 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; int numa_off __initdata; +unsigned long __initdata nodemap_addr; +unsigned long __initdata nodemap_size; /* @@ -52,34 +54,88 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) int res = -1; unsigned long addr, end; - if (shift >= 64) - return -1; - memset(memnodemap, 0xff, sizeof(memnodemap)); + memset(memnodemap, 0xff, memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; if (addr >= end) continue; - if ((end >> shift) >= NODEMAPSIZE) + if ((end >> shift) >= memnodemapsize) return 0; do { if (memnodemap[addr >> shift] != 0xff) return -1; memnodemap[addr >> shift] = i; - addr += (1UL << shift); + addr += (1UL << shift); } while (addr < end); res = 1; } return res; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +static int __init allocate_cachealigned_memnodemap(void) { - int shift = 20; + unsigned long pad, pad_addr; + + memnodemap = memnode.embedded_map; + if (memnodemapsize <= 48) + return 0; + + pad = L1_CACHE_BYTES - 1; + pad_addr = 0x8000; + nodemap_size = pad + memnodemapsize; + nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, + nodemap_size); + if (nodemap_addr == -1UL) { + printk(KERN_ERR + "NUMA: Unable to allocate Memory to Node hash map\n"); + nodemap_addr = nodemap_size = 0; + return -1; + } + pad_addr = (nodemap_addr + pad) & ~pad; + memnodemap = phys_to_virt(pad_addr); + + printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", + nodemap_addr, nodemap_addr + nodemap_size); + return 0; +} - while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0) - shift++; +/* + * The LSB of all start and end addresses in the node map is the value of the + * maximum possible shift. + */ +static int __init +extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +{ + int i, nodes_used = 0; + unsigned long start, end; + unsigned long bitfield = 0, memtop = 0; + + for (i = 0; i < numnodes; i++) { + start = nodes[i].start; + end = nodes[i].end; + if (start >= end) + continue; + bitfield |= start; + nodes_used++; + if (end > memtop) + memtop = end; + } + if (nodes_used <= 1) + i = 63; + else + i = find_first_bit(&bitfield, sizeof(unsigned long)*8); + memnodemapsize = (memtop >> i)+1; + return i; +} + +int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +{ + int shift; + shift = extract_lsb_from_nodes(nodes, numnodes); + if (allocate_cachealigned_memnodemap()) + return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); @@ -216,31 +272,113 @@ void __init numa_init_array(void) } #ifdef CONFIG_NUMA_EMU +/* Numa emulation */ int numa_fake __initdata = 0; -/* Numa emulation */ +/* + * This function is used to find out if the start and end correspond to + * different zones. + */ +int zone_cross_over(unsigned long start, unsigned long end) +{ + if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && + (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) + return 1; + return 0; +} + static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - int i; + int i, big; struct bootnode nodes[MAX_NUMNODES]; - unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake; + unsigned long sz, old_sz; + unsigned long hole_size; + unsigned long start, end; + unsigned long max_addr = (end_pfn << PAGE_SHIFT); + + start = (start_pfn << PAGE_SHIFT); + hole_size = e820_hole_size(start, max_addr); + sz = (max_addr - start - hole_size) / numa_fake; /* Kludge needed for the hash function */ - if (hweight64(sz) > 1) { - unsigned long x = 1; - while ((x << 1) < sz) - x <<= 1; - if (x < sz/2) - printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n"); - sz = x; - } + old_sz = sz; + /* + * Round down to the nearest FAKE_NODE_MIN_SIZE. + */ + sz &= FAKE_NODE_MIN_HASH_MASK; + + /* + * We ensure that each node is at least 64MB big. Smaller than this + * size can cause VM hiccups. + */ + if (sz == 0) { + printk(KERN_INFO "Not enough memory for %d nodes. Reducing " + "the number of nodes\n", numa_fake); + numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Number of fake nodes will be = %d\n", + numa_fake); + sz = FAKE_NODE_MIN_SIZE; + } + /* + * Find out how many nodes can get an extra NODE_MIN_SIZE granule. + * This logic ensures the extra memory gets distributed among as many + * nodes as possible (as compared to one single node getting all that + * extra memory. + */ + big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE; + printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: " + "%d\n", + (sz >> 20), (hole_size >> 20), big); memset(&nodes,0,sizeof(nodes)); + end = start; for (i = 0; i < numa_fake; i++) { - nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz; + /* + * In case we are not able to allocate enough memory for all + * the nodes, we reduce the number of fake nodes. + */ + if (end >= max_addr) { + numa_fake = i - 1; + break; + } + start = nodes[i].start = end; + /* + * Final node can have all the remaining memory. + */ if (i == numa_fake-1) - sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start; - nodes[i].end = nodes[i].start + sz; + sz = max_addr - start; + end = nodes[i].start + sz; + /* + * Fir "big" number of nodes get extra granule. + */ + if (i < big) + end += FAKE_NODE_MIN_SIZE; + /* + * Iterate over the range to ensure that this node gets at + * least sz amount of RAM (excluding holes) + */ + while ((end - start - e820_hole_size(start, end)) < sz) { + end += FAKE_NODE_MIN_SIZE; + if (end >= max_addr) + break; + } + /* + * Look at the next node to make sure there is some real memory + * to map. Bad things happen when the only memory present + * in a zone on a fake node is IO hole. + */ + while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { + if (zone_cross_over(start, end + sz)) { + end = (MAX_DMA32_PFN << PAGE_SHIFT); + break; + } + if (end >= max_addr) + break; + end += FAKE_NODE_MIN_SIZE; + } + if (end > max_addr) + end = max_addr; + nodes[i].end = end; printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", i, nodes[i].start, nodes[i].end, @@ -290,6 +428,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) end_pfn << PAGE_SHIFT); /* setup dummy node covering all memory */ memnode_shift = 63; + memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); node_set_online(0); @@ -321,20 +460,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -#ifdef CONFIG_SPARSEMEM -static void __init arch_sparse_init(void) -{ - int i; - - for_each_online_node(i) - memory_present(i, node_start_pfn(i), node_end_pfn(i)); - - sparse_init(); -} -#else -#define arch_sparse_init() do {} while (0) -#endif - void __init paging_init(void) { int i; @@ -344,7 +469,8 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = end_pfn; - arch_sparse_init(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); + sparse_init(); for_each_online_node(i) { setup_node_zones(i); diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index ccb91dd996a..65c5eaa5990 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -107,6 +107,7 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) pud_t *pud; pmd_t *pmd; pte_t large_pte; + unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -114,7 +115,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; + large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile index 149aba05a5b..c9eddc8859c 100644 --- a/arch/x86_64/pci/Makefile +++ b/arch/x86_64/pci/Makefile @@ -11,7 +11,7 @@ obj-y += fixup.o init.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_NUMA) += k8-bus.o @@ -24,3 +24,4 @@ fixup-y += ../../i386/pci/fixup.o i386-y += ../../i386/pci/i386.o init-y += ../../i386/pci/init.o early-y += ../../i386/pci/early.o +mmconfig-shared-y += ../../i386/pci/mmconfig-shared.o diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index faabb6e87f1..65d82736987 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -13,16 +13,6 @@ #include "pci.h" -/* aperture is up to 256MB but BIOS may reserve less */ -#define MMCONFIG_APER_MIN (2 * 1024*1024) -#define MMCONFIG_APER_MAX (256 * 1024*1024) - -/* Verify the first 16 busses. We assume that systems with more busses - get MCFG right. */ -#define MAX_CHECK_BUS 16 - -static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS); - /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { struct acpi_mcfg_allocation *cfg; @@ -32,30 +22,17 @@ static struct mmcfg_virt *pci_mmcfg_virt; static char __iomem *get_virt(unsigned int seg, unsigned bus) { - int cfg_num = -1; struct acpi_mcfg_allocation *cfg; + int cfg_num; - while (1) { - ++cfg_num; - if (cfg_num >= pci_mmcfg_config_num) - break; + for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) { cfg = pci_mmcfg_virt[cfg_num].cfg; - if (cfg->pci_segment != seg) - continue; - if ((cfg->start_bus_number <= bus) && + if (cfg->pci_segment == seg && + (cfg->start_bus_number <= bus) && (cfg->end_bus_number >= bus)) return pci_mmcfg_virt[cfg_num].virt; } - /* Handle more broken MCFG tables on Asus etc. - They only contain a single entry for bus 0-0. Assume - this applies to all busses. */ - cfg = &pci_mmcfg_config[0]; - if (pci_mmcfg_config_num == 1 && - cfg->pci_segment == 0 && - (cfg->start_bus_number | cfg->end_bus_number) == 0) - return pci_mmcfg_virt[0].virt; - /* Fall back to type 0 */ return NULL; } @@ -63,8 +40,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus) static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) { char __iomem *addr; - if (seg == 0 && bus < MAX_CHECK_BUS && - test_bit(32*bus + PCI_SLOT(devfn), fallback_slots)) + if (seg == 0 && bus < PCI_MMCFG_MAX_CHECK_BUS && + test_bit(32*bus + PCI_SLOT(devfn), pci_mmcfg_fallback_slots)) return NULL; addr = get_virt(seg, bus); if (!addr) @@ -135,79 +112,46 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; -/* K8 systems have some devices (typically in the builtin northbridge) - that are only accessible using type1 - Normally this can be expressed in the MCFG by not listing them - and assigning suitable _SEGs, but this isn't implemented in some BIOS. - Instead try to discover all devices on bus 0 that are unreachable using MM - and fallback for them. */ -static __init void unreachable_devices(void) +static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) { - int i, k; - /* Use the max bus number from ACPI here? */ - for (k = 0; k < MAX_CHECK_BUS; k++) { - for (i = 0; i < 32; i++) { - u32 val1; - char __iomem *addr; - - pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1); - if (val1 == 0xffffffff) - continue; - addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); - if (addr == NULL|| readl(addr) != val1) { - set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE "PCI: No mmconfig possible" - " on device %02x:%02x\n", k, i); - } - } + void __iomem *addr; + u32 size; + + size = (cfg->end_bus_number + 1) << 20; + addr = ioremap_nocache(cfg->address, size); + if (addr) { + printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", + cfg->address, cfg->address + size - 1); } + return addr; } -void __init pci_mmcfg_init(int type) +int __init pci_mmcfg_arch_reachable(unsigned int seg, unsigned int bus, + unsigned int devfn) { - int i; - - if ((pci_probe & PCI_PROBE_MMCONF) == 0) - return; - - acpi_table_parse(ACPI_SIG_MCFG, acpi_parse_mcfg); - if ((pci_mmcfg_config_num == 0) || - (pci_mmcfg_config == NULL) || - (pci_mmcfg_config[0].address == 0)) - return; - - /* Only do this check when type 1 works. If it doesn't work - assume we run on a Mac and always use MCFG */ - if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].address, - pci_mmcfg_config[0].address + MMCONFIG_APER_MIN, - E820_RESERVED)) { - printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %lx is not E820-reserved\n", - (unsigned long)pci_mmcfg_config[0].address); - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); - return; - } + return pci_dev_base(seg, bus, devfn) != NULL; +} - pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); +int __init pci_mmcfg_arch_init(void) +{ + int i; + pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * + pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n"); - return; + return 0; } + for (i = 0; i < pci_mmcfg_config_num; ++i) { pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; - pci_mmcfg_virt[i].virt = ioremap_nocache(pci_mmcfg_config[i].address, - MMCONFIG_APER_MAX); + pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); if (!pci_mmcfg_virt[i].virt) { printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " "segment %d\n", pci_mmcfg_config[i].pci_segment); - return; + return 0; } - printk(KERN_INFO "PCI: Using MMCONFIG at %lx\n", - (unsigned long)pci_mmcfg_config[i].address); } - - unreachable_devices(); - raw_pci_ops = &pci_mmcfg; - pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + return 1; } |